# RAG with GDELT data

## Sources
- GDELT DOC 2.0 API: [announcement and documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/)
- LlamaIndex: [GitHub repository](https://github.com/run-llama/llama_index)
- Following this [tutorial from Analytics Vidhya](https://www.analyticsvidhya.com/blog/2023/10/rag-pipeline-with-the-llama-index/)
  - and [this article on Medium](https://medium.com/@sandyludosky/rag-and-internet-browsing-eng-56ac9bb073a9)

In [42]:
## IMPORTS 
import os
from llama_index.core import ServiceContext, PromptHelper, VectorStoreIndex, SimpleDirectoryReader, set_global_service_context 
# LLMPredictor, OpenAIEmbedding,
from llama_index.llms.openai import OpenAI
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.core.node_parser import SimpleNodeParser

## FOR THE WEB SCRAPING
from tqdm import tqdm
from trafilatura.sitemaps import sitemap_search
from trafilatura import extract_metadata
import requests
from bs4 import BeautifulSoup

## API KEYS
import openai
from openai import OpenAI
# The key is read from the environment so it never lands in the notebook itself.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = "org-raWgaVqCbuR9YlP1CIjclYHk"  # Harvard

# Sanity check: prints True when a non-empty key was found, False otherwise.
print(bool(openai.api_key))

True


In [43]:
## GRABBING THE DATA FROM GDELT 
import requests
import json

def fetch_gdelt_data(query, mode="artlist", max_records=100, timespan="1week", timeout=30):
    """Query the GDELT DOC 2.0 API and return the decoded JSON response.

    Args:
        query: Search expression in GDELT query syntax. It is sent as given
            (only URL-encoded), e.g. '("islamic state" OR isis OR somalia)'.
        mode: Value of the API's ``mode`` parameter.
        max_records: Value of the API's ``maxrecords`` parameter.
        timespan: Value of the API's ``timespan`` parameter.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload, or None when the HTTP request itself fails
        (the failure is printed).

    Raises:
        ValueError: If ``query`` is empty or blank, or if the response body is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    from urllib.parse import quote, urlencode

    if not query or not str(query).strip():
        raise ValueError("query must be a non-empty GDELT search expression")

    params = {
        "query": query,
        "mode": mode,
        "maxrecords": max_records,
        "timespan": timespan,
        "format": "JSON",
    }
    # quote() encodes spaces as %20 and quotes as %22; parentheses stay
    # literal, which matches the hand-written URL this function used to send.
    url = "https://api.gdeltproject.org/api/v2/doc/doc?" + urlencode(
        params, safe="()", quote_via=quote
    )

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    # Decoding happens outside the request try-block on purpose: in recent
    # requests releases the decode error is itself a RequestException, so the
    # handler above would swallow it and the re-raise below would never run.
    try:
        return response.json()
    except ValueError as e:
        print(f"JSON decode error: {e}")
        raise

# TODO make this legit responsive (e.g. take the query from user input)
# Test case: Islamic State activity in Somalia. The expression is written in
# GDELT's own query syntax: a quoted phrase OR-ed with two keywords, grouped in
# parentheses. It is the same expression the request URL has always carried.
query = '("islamic state" OR isis OR somalia)'
gdelt_data = fetch_gdelt_data(query)
print(json.dumps(gdelt_data, indent=4))


{
    "articles": [
        {
            "url": "https://abcnews.go.com/International/top-isis-leader-somalia-target-us-airstrike/story?id=111160552",
            "url_mobile": "https://abcnews.go.com/amp/International/top-isis-leader-somalia-target-us-airstrike/story?id=111160552",
            "title": "Top ISIS leader in Somalia was target of US airstrike",
            "seendate": "20240616T040000Z",
            "socialimage": "https://i.abcnewsfe.com/a/545c2681-242a-4a63-9856-658269eb5da4/MQ-9Reaper-ht-ml-231108_1699473054499_hpMain_16x9.jpg",
            "domain": "abcnews.go.com",
            "language": "English",
            "sourcecountry": "United States"
        },
        {
            "url": "https://www.washingtonexaminer.com/policy/defense/3046728/us-islamic-state-group-leader-airstrike-last-month/",
            "url_mobile": "",
            "title": "US targeted Islamic State group leader in airstrike last month : Report",
            "seendate": "20240616T010000Z",
   

In [44]:
## RETRIEVING THE URLS AND READING THEM
# Pull the article links out of the GDELT payload; a payload without an
# "articles" key yields an empty list.
urls = [entry["url"] for entry in gdelt_data.get("articles", [])]
# Show the links, then how many there are (usually 100).
print(urls, len(urls), sep="\n")

# Only a 20-article sample is scraped while testing.
test = urls[0:20]

['https://abcnews.go.com/International/top-isis-leader-somalia-target-us-airstrike/story?id=111160552', 'https://www.washingtonexaminer.com/policy/defense/3046728/us-islamic-state-group-leader-airstrike-last-month/', 'https://ktbb.com/post/?p=1331071', 'https://www.jpost.com/breaking-news/article-806574', 'https://live.jpost.com/breaking-news/article-806574', 'https://www.kdks.fm/syndicated-article/?id=1569647', 'https://www.wjol.com/syndicated-article/?id=1569647', 'http://www.northwestmoinfo.com/syndicated-article/?id=1569647', 'http://www.k101fm.net/syndicated-article/?id=1569647', 'https://hitsfm.net/world/8a9fce7898186b3fd53fa13d70c1b7a4', 'https://wondradio.com/abc-world/f37d46cfc9c6db39330d6717c0122576', 'https://kxel.com/2024/06/16/top-isis-leader-in-somalia-was-target-of-us-airstrike/', 'http://www.wjnt.com/syndicated-article/?id=1569647', 'https://www.mymoinfo.com/syndicated-article/?id=1569647', 'https://wadk.com/world-news/9fe53c802b0586b5e624a1c33dd7c0fb', 'http://www.cent

In [64]:
def create_dataset(list_of_websites: list, timeout: float = 15.0) -> list:
    """
    Download each URL and extract its title, description and paragraph text.

    Pages that cannot be fetched (HTTP error status, connection failure,
    timeout, or any other ``requests`` error) are reported on stdout and
    skipped, so the result may be shorter than the input.

    Args:
        list_of_websites: URLs to scrape.
        timeout: Seconds to wait on each request before giving up. Without a
            timeout a single unresponsive host can stall the whole run.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``body`` and
        ``description``. ``title`` is always a string ("" when the page has no
        usable <title>); ``description`` is None when no metadata object was
        produced for the page.
    """
    data = []
    for url in tqdm(list_of_websites, desc="urls"):
        # Only the network call sits in the try-block; parsing problems are
        # not request errors and should not be mistaken for them.
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Treat 4xx/5xx as a failed fetch
        except requests.exceptions.HTTPError as errh:
            print(f"HTTP Error: {errh}")
            continue
        except requests.exceptions.ConnectionError as errc:
            print(f"Error Connecting: {errc}")
            continue
        except requests.exceptions.Timeout as errt:
            print(f"Timeout Error: {errt}")
            continue
        except requests.RequestException as err:
            print(f"Error during requests to {url}: {str(err)}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        metadata = extract_metadata(response.content)

        # soup.title is None when the page has no <title> tag, and .string is
        # None when the tag contains nested markup; fall back to "" so callers
        # can always concatenate the title.
        title_tag = soup.title
        has_title = title_tag is not None and title_tag.string
        title = str(title_tag.string) if has_title else ""

        # NOTE(review): extract_metadata may hand back None for pages it
        # cannot parse -- getattr keeps that case from raising. Confirm
        # against the installed trafilatura version.
        description = getattr(metadata, "description", None)

        # Body text is the concatenation of every <p> element, one per line.
        paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        data.append(
            {
                "url": url,
                "title": title,
                "body": "\n".join(paragraphs),
                "description": description,
            }
        )
    return data

def scrape(list_of_websites: list, output_path: str = "./data/dataset.txt") -> None:
    """
    Scrape the given URLs and write their titles and bodies to a text file.

    Each page is written as a blank line, the title, the body, and two
    trailing newlines. Any existing file at ``output_path`` is overwritten.

    Args:
        list_of_websites: URLs passed on to ``create_dataset``.
        output_path: Destination file. Its parent directory is created when
            missing, so a fresh checkout without ``./data`` still works.
    """
    data = create_dataset(list_of_websites)

    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as file:
        for page in data:
            # A page without a usable <title> may carry None here; write an
            # empty title line instead of failing on str + None.
            title = page["title"] or ""
            file.write("\n" + title + "\n")
            file.write(page["body"] + "\n\n")

In [66]:
## CREATE DOCUMENT SET
# Scrape the 20-URL sample (`test`) and write each page's title and body text
# to ./data/dataset.txt; the file is opened in "w" mode, so a previous run's
# output is overwritten.
scrape(test)

urls:  60%|██████    | 12/20 [00:03<00:02,  3.12it/s]

HTTP Error: 403 Client Error: Forbidden for url: https://kxel.com/2024/06/16/top-isis-leader-in-somalia-was-target-of-us-airstrike/


urls: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


In [71]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.character import CharacterTextSplitter

## FRAGMENTING DOCUMENTS
def split_documents(path: str = "./data/dataset.txt", chunk_size: int = 100, chunk_overlap: int = 0):
    """Load a text file and split it into chunks.

    This function only loads and splits; embedding and vector-store loading
    happen elsewhere.

    Args:
        path: Text file to load.
        chunk_size: Target chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunked documents produced by the splitter.
    """
    # The dataset is written as UTF-8, so read it back as UTF-8 rather than
    # with the platform's default encoding.
    raw_documents = TextLoader(path, encoding="utf-8").load()
    # NOTE(review): the run log shows "Created a chunk of size N, which is
    # longer than the specified 100" for most chunks. CharacterTextSplitter
    # appears to cut only at its separator (blank lines by default), so longer
    # paragraphs stay whole. Consider a larger chunk_size or a recursive
    # splitter if bounded chunks are required -- confirm against the docs.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(raw_documents)

# Chunk the scraped corpus once; `docs` is what later gets passed to
# load_embeddings.
docs = split_documents()

Created a chunk of size 1116, which is longer than the specified 100
Created a chunk of size 469, which is longer than the specified 100
Created a chunk of size 722, which is longer than the specified 100
Created a chunk of size 2131, which is longer than the specified 100
Created a chunk of size 2211, which is longer than the specified 100
Created a chunk of size 1170, which is longer than the specified 100
Created a chunk of size 1170, which is longer than the specified 100
Created a chunk of size 2183, which is longer than the specified 100
Created a chunk of size 134, which is longer than the specified 100
Created a chunk of size 2183, which is longer than the specified 100
Created a chunk of size 133, which is longer than the specified 100
Created a chunk of size 2183, which is longer than the specified 100
Created a chunk of size 141, which is longer than the specified 100
Created a chunk of size 2183, which is longer than the specified 100
Created a chunk of size 135, which is l

In [68]:
import os
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
)
from langchain.prompts.chat import (
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma

In [76]:
## VECTOR STORE 
def load_embeddings(documents):
    """Build a Chroma vector store from ``documents`` using OpenAI embeddings.

    The store itself is returned (not a retriever), so callers can run
    ``similarity_search`` on it directly.
    """
    embedding_model = OpenAIEmbeddings()
    store = Chroma.from_documents(documents, embedding_model)
    return store

# Build the vector store from the chunked documents, then run one sample
# similarity search as a smoke test and print the matching documents.
vector_db = load_embeddings(docs)
print(vector_db.similarity_search("islamic state"))


[Document(page_content='Posted Sunday morning\n(SOMALIA) -- A United States military airstrike in Somalia targeted the top Islamic State leader in late May, but it remains unclear if the leader was killed in the airstrike, according to U.S. officials.\nAbdulqadir Mumin has beenidentifiedby the U.S. as the head of the Islamic State in Somalia, an African affiliate of the terror group once known as ISIS.\nAstatementfrom U.S. Africa Command disclosed that on May 31 it had conducted an airstrike in a remote area of northeastern Somalia, south of the Gulf of Aden, that killed three militants affiliated with the Islamic State. The statement did not provide more accurate information about who was specifically being targeted in the airstrike.\nThree U.S. officials confirmed to ABC News that the target of the strike was the top leader of the Islamic State in Somalia whom the National Counterterrorism Center (NCTC) identified as Mumin. However, the U.S. officials said it remained unclear if Mumi