In [11]:
# !source /content/drive/MyDrive/GEN_AI_2/venv/bin/activate; pip3 install --quiet --upgrade langchain-text-splitters langchain-community langgraph langchain-google-genai "langchain-chroma>=0.1.2"

# Generative AI 2 Project: RAG System

## Project Setup

### Dependencies

In [12]:
import os
from dotenv import load_dotenv
import bs4
from uuid import uuid4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain import hub
from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph

### Loading Environment Variables

In [13]:
load_dotenv()

True

## Retrieval

### Loading Webpage Article

In [14]:
bs4_strainer = bs4.SoupStrainer(class_=("BlogItem-title", "col sqs-col-12 span-12"))
loader = WebBaseLoader(
    web_paths=("https://datareportal.com/reports/digital-2025-global-overview-report",),
    bs_kwargs={"parse_only": bs4_strainer},
    encoding="utf-8",
)
docs = loader.load()

assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")

Total characters: 141159


### Text Extraction Validation Test

In [15]:
assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")
print(docs[0].page_content[:500])

Total characters: 141159
Digital 2025: Global Overview ReportADVERTISEMENT







2025 looks set to be another bumper year in digital, with rapid advances in AI, the evolving social media landscape, and broadening online behaviours all gearing up to reshape our connected lives over the coming months.With so much going on, the pace of digital trends can feel overwhelming, but our enormous new Digital 2025 Global Overview Report – published in partnership between We Are Social and Meltwater – has all the stats, insights, 


### Text Splitting

In [16]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

all_splits = text_splitter.split_documents(docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 226 sub-documents.


In [17]:
print(all_splits[5].page_content)

a later date. Because of this, we strongly encourage you to explore the timeline charts available in our local country reports, which will provide more informed insight into the evolution of digital adoption and behaviour. Note that our Digital 2025 local country reports will be available to read in full – and for free – on the DataReportal website at the end of February 2025, but you can already explore data for previous years via our complete online library.The United Nations published a comprehensive update to its World Population Prospects data in July 2024, and this update included some important revisions to population numbers around the world. In addition to impacting the figures that we report for population, these revisions may also affect any data point where we show digital adoption or activity as a percentage of the total population (e.g. internet and social media adoption), or as a percentage of specific age groups within the broader population (e.g. people aged 18 and


### Storing Text Chunks

In [18]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=os.getenv('GEMINI_API_KEY'))

vector_store = Chroma(
    collection_name="text_collection",
    embedding_function=embeddings,
    persist_directory="./chroma",  # Where to save data locally, remove if not necessary
)
uuids = [str(uuid4()) for _ in range(len(all_splits))]
vector_store.add_documents(documents=all_splits, ids=uuids)

['636dfb11-f853-45b0-9ee6-160a782287f9',
 '8bd2b996-aba2-4792-a96e-08d6f620f4a4',
 '5bbefa3e-7fdd-4f97-bd59-8be9a1fd607f',
 '28e49d6b-1f4c-4579-8c9e-52fb859cbcff',
 '9d0d9713-8d88-4501-a9ff-a895d5162c65',
 '56098677-96f1-4feb-b0db-bc96ce2d7ce4',
 'a52c0111-e023-421c-9aad-4bfd2096d4ba',
 'ee026569-0ebf-4f7d-bf15-648ef3a686bf',
 '03f22009-c6df-4102-9ac4-016e123c2493',
 '374df7a3-bb9b-4ebb-9aee-fbe740296f91',
 '3764193c-1dab-4545-b260-91bea4cdc278',
 'b16ea068-2143-4f9b-8028-396706b69035',
 '86ee3503-35f7-43a9-94ec-4c26fcc547fe',
 'b5066c7e-0759-41cc-980b-b26fc8490a5f',
 'f4140449-4ce6-4e06-9045-0a6e7ed72a9e',
 '32b4d9cb-f0d5-409f-9bfe-9ce7bc3acb6f',
 'f8dc0cb2-d7b1-4cc1-8914-c4f75a7c485c',
 '6215b7a5-d200-4d46-a2fc-aa87f7c3d61d',
 '6f125dec-6ffd-4e51-bb73-14ee57c54d9d',
 '385d66b8-f102-436f-b030-5842ef5737ed',
 '12447f4a-bd6f-4958-8f38-f93225a7d0d7',
 'be955f1b-069f-4dce-96f4-18ab5fa8c9b9',
 'e241df8b-ffe6-47d8-83ab-f980f3518a48',
 'c0e00e4b-cb4a-48aa-9dbc-4d14d42ca16b',
 'db0c0433-ea3a-

In [19]:
results = vector_store.similarity_search(
    "tiktok adult reach",
)

## Generation

### Prompt Template

In [20]:
template = """You are a helpful assistant that answers questions about the Digital 2025 Global Overview Report.
You are only allowed to use the following pieces of context to formulate your answer. If the answer to the question
lies outside of the provided context, simply state that you don't know, and don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.

Chat History: {chat_history}

Question: {question}

Contetxt: {context}

Helpful Answer:"""

prompt = PromptTemplate.from_template(template)

In [21]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2, 
    google_api_key=os.getenv('GEMINI_API_KEY')
)

In [22]:
class State(TypedDict):
    question: str
    context: List[Document]
    chat_history: List[str]
    answer: str

In [23]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

def generate(state: State):
    question = state["question"]
    context = "\n\n".join(doc.page_content for doc in state["context"])
    chat_history = '\n'.join(state["chat_history"])
    messages = prompt.invoke({"question": question, "context": context, "chat_history": chat_history})
    response = llm.invoke(messages)
    return {"answer": response.content}

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

## Question/Answer Testing

### Question 1

In [24]:
result = graph.invoke({"question": "how has the median cellular data transfer rate in Myanmar changed in the past 12 months?", "chat_history":""})

print(f'Answer: {result["answer"]}')

Answer: The median cellular data transfer rate in Myanmar has fallen by almost 80 percent over the past 12 months. Mobile users in Myanmar now struggle with a median bandwidth of just 5.09 Mbps.


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('422 Client Error: unknown for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unprocessable entity: invalid JSON part for patch.a84b42ef-9a62-43fb-be2d-14799e55df24: could not unmarshal run: unexpected end of JSON input"}\n')


### Question 2

In [25]:
result = graph.invoke({"question": "how many users do youtube ads reach each month?", "chat_history":""})

print(f'Answer: {result["answer"]}')

Answer: YouTube ads reach at least 2.53 billion users each month across 88 countries. This figure has increased by 40 million compared to the previous year. However, the numbers hadn't been updated since October at the time of report production.


### Question 3

In [26]:
result = graph.invoke({"question": "Are older people more likely to follow meme, and parody accounts?", "chat_history":""})

print(f'Answer: {result["answer"]}')

Answer: No, older people are less likely to follow meme and parody accounts. The appeal of these accounts drops off quickly as people get older. This category doesn’t even appear in the top ten amongst social media users aged 55 and above.


### Question 4

In [27]:
result = graph.invoke({"question": "some claim that facebook maybe 'dying as a platform, does the data that facebook still attracts users?", "chat_history":""})
print(f'Answer: {result["answer"]}')

Answer: The data indicates that Facebook remains the world’s second most used social media platform at the start of 2025. Third-party intelligence suggests that the platform still attracts 2.3 billion users to its mobile app each month. Meta’s data suggests that Facebook ads still reach more than 2¼ billion adults each month.


### Question 5

In [29]:
result = graph.invoke({"question": "what are the top ranking platforms for brand research in 2025?", "chat_history":""})
print(f'Answer: {result["answer"]}')

Answer: The top social platforms for brand research are Instagram, Facebook, and TikTok. Instagram ranks first, with 62.3 percent of its active adult users utilizing the platform for brand and product research. Facebook ranks second at 52.5 percent, and TikTok ranks third at 51.5 percent.


## Dialog Flow

In [None]:
def dialog():
    print("Hello, i am here to help you answer your questions about the Digital 2025 Global Overview Report. How may i help you?\n")
    chat_history = []
    while True:
        prompt = input("prompt: ")
        result = graph.invoke({"question": "what are the top ranking platforms for brand research in 2025?", "chat_history":chat_history})
        print(f'Answer: {result["answer"]}')
        print("Do you have any more questions? (Y/N)")
        exit_dialog = input("continue? ")
        if exit_dialog.lower() == 'n':
            break
        chat_history.append("prompt: " + prompt + "\nanswer: " + result["answer"])
    pass

dialog()

Hello, i am here to help you answer your questions about the Digital 2025 Global Overview Report. How may i help you?



## Langsmith Link:
https://smith.langchain.com/o/c4e1bf8b-29f8-4eb7-ba10-63a709b30eb0/