# RAG with GDELT data

## Sources
- GDELT data: [GDELT DOC 2.0 API announcement](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/)
- LlamaIndex: [GitHub repository](https://github.com/run-llama/llama_index)
- Based on this [tutorial from Analytics Vidhya](https://www.analyticsvidhya.com/blog/2023/10/rag-pipeline-with-the-llama-index/)
  - and [this one from Medium](https://medium.com/@sandyludosky/rag-and-internet-browsing-eng-56ac9bb073a9)

In [14]:
import streamlit as st
import requests
from bs4 import BeautifulSoup
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pandas as pd
from langchain_community.llms import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
import os
import json

from llama_index.core import ServiceContext, PromptHelper, VectorStoreIndex, SimpleDirectoryReader, set_global_service_context 
from llama_index.llms.openai import OpenAI
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.core.node_parser import SimpleNodeParser

from tqdm import tqdm
from trafilatura.sitemaps import sitemap_search
from trafilatura import extract_metadata

## API KEYS
import openai
from openai import OpenAI
# Configure the OpenAI client: fixed organization id, API key from the environment.
openai.organization = "org-raWgaVqCbuR9YlP1CIjclYHk" # Harvard
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Report only whether a key was found; never print the key itself.
print(bool(openai.api_key))

True


In [2]:
## GRABBING THE DATA FROM GDELT 
import requests
import json

# def get_gdelt_data(query):
#     url = "https://api.gdeltproject.org/api/v2/doc/doc?query=(%22islamic%20state%22%20OR%20isis%20OR%20somalia)&mode=artlist&maxrecords=100&timespan=1week&format=JSON"
#     try:
#         response = requests.get(url)
#         response.raise_for_status()
#         return response.json()
#     except requests.exceptions.RequestException as e:
#         print(f"Request failed: {e}")
#     except json.JSONDecodeError as e:
#         print(f"JSON decode error: {e.msg}")
#         raise json.JSONDecodeError(e.msg, e.doc, e.pos)

def get_gdelt_data(query, start_date, end_date, max_records=10, timeout=30):
    """Return article URLs from the GDELT DOC API that match ``query``.

    The search is restricted to English-language sources by appending
    ``sourcelang:english`` to the query.

    Args:
        query: Search expression passed to GDELT.
        start_date: ``datetime`` marking the start of the search window.
        end_date: ``datetime`` marking the end of the search window.
        max_records: Maximum number of articles to request (default 10).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of article URL strings (empty when nothing matched).

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-2xx HTTP status.
        ValueError: When the response body is not valid JSON.
    """
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    print(query)
    lang_query = f"{query} sourcelang:english"
    params = {
        "query": lang_query,
        "mode": "artlist",
        "format": "json",
        "startdatetime": start_date.strftime("%Y%m%d%H%M%S"),
        "enddatetime": end_date.strftime("%Y%m%d%H%M%S"),
        "maxrecords": max_records,
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()
    try:
        payload = response.json()
    except ValueError as err:
        # Surface the start of the body so a non-JSON reply is diagnosable.
        raise ValueError(
            f"GDELT returned a non-JSON response: {response.text[:200]!r}"
        ) from err
    # Skip malformed entries instead of failing on a missing "url" key.
    return [
        article["url"]
        for article in payload.get("articles", [])
        if article.get("url")
    ]

# TODO: make this properly responsive (the query and date window are hard-coded for now)
query = "sudan"
# Search window: the 30 days ending at the current time.
end_date = datetime.now()
start_date = end_date - timedelta(days=30) 
gdelt_data = get_gdelt_data(query, start_date, end_date)
# Pretty-print the list of article URLs that came back.
print(json.dumps(gdelt_data, indent=4))


sudan
[
    "https://www.yenisafak.com/en/news/uae-contributes-25m-to-un-for-humanitarian-efforts-in-sudan-south-sudan-3686356",
    "https://news.webindia123.com/news/Articles/World/20240624/4208555.html",
    "https://gulfnews.com/uae/uae-contributes-25-million-to-un-for-humanitarian-efforts-in-sudan-1.103250476",
    "https://allafrica.com/stories/202406230024.html",
    "https://www.radiotamazuj.org/en/news/article/muslim-leader-calls-on-kiir-to-mediate-peace-in-sudan",
    "https://www.standardmedia.co.ke/opinion/article/2001496412/south-sudan-can-unlock-its-potential-with-the-help-of-investors",
    "https://www.standardmedia.co.ke/business/article/2001496412/south-sudan-can-unlock-its-potential-with-the-help-of-investors",
    "https://www.marketscreener.com/news/latest/Sudan-oil-pipeline-resumption-imminent-says-South-Sudan-official-46842113/",
    "https://reliefweb.int/report/sudan/sudan-over-1500-children-subjected-extreme-violence-conflict-breaks-records-crimes-against-chil

In [3]:
## RETRIEVING THE URLS
# get_gdelt_data() already returns a plain list of article URLs, so no further
# extraction from the API response is needed here.
urls = gdelt_data

In [4]:
def create_dataset(list_of_websites: list, timeout: float = 30) -> list:
    """Scrape title, description and paragraph text from each URL.

    URLs that cannot be fetched are reported on stdout and skipped, so a
    single bad site never aborts the whole run.

    Args:
        list_of_websites: URLs to download.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of dicts with the keys ``"url"``, ``"title"``, ``"body"`` and
        ``"description"``, one per successfully fetched page. ``"title"`` is
        an empty string when the page has no usable ``<title>``;
        ``"description"`` is ``None`` when no metadata could be extracted.
    """
    data = []
    for url in tqdm(list_of_websites, desc="urls"):
        # Keep the try body limited to the network call that can raise.
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Check for successful response
        except requests.exceptions.HTTPError as errh:
            print(f"HTTP Error: {errh}")
            continue
        except requests.exceptions.ConnectionError as errc:
            print(f"Error Connecting: {errc}")
            continue
        except requests.exceptions.Timeout as errt:
            print(f"Timeout Error: {errt}")
            continue
        except requests.RequestException as err:
            print(f"Error during requests to {url}: {str(err)}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        metadata = extract_metadata(response.content)

        # Pages without a <title>, or with markup nested inside it, yield
        # None here; fall back to an empty string so callers can concatenate.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        if title is None:
            title = ""

        # extract_metadata can return None when nothing could be extracted.
        description = metadata.description if metadata is not None else None

        # Extract text from each paragraph
        paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        data.append(
            {
                "url": url,
                "title": title,
                "body": "\n".join(paragraphs),
                "description": description,
            }
        )
    return data

def scrape(list_of_websites: list) -> None:
    """Scrape the given URLs and write their text to a new file in ``./data``.

    The file is named ``dataset_<DDHHMMSS>.txt`` from the current time. Each
    article is written as a blank line, its title, its body, then two
    newlines.

    Args:
        list_of_websites: URLs to scrape via ``create_dataset``.
    """
    data = create_dataset(list_of_websites)

    current_time = datetime.now().strftime("%d%H%M%S")
    dataset_filename = f"./data/dataset_{current_time}.txt"

    # open() does not create missing directories, so make sure ./data exists.
    os.makedirs(os.path.dirname(dataset_filename), exist_ok=True)

    with open(dataset_filename, "w", encoding="utf-8") as file:
        for article in data:
            # A page without a usable <title> may carry None; write it as "".
            title = article["title"] or ""
            file.write("\n" + title + "\n")
            file.write(article["body"] + "\n\n")

In [5]:
## CREATE DOCUMENT SET
# Scrape every GDELT article URL and write the combined text to a new file under ./data.
scrape(urls)

urls:  10%|█         | 1/10 [00:00<00:00,  9.54it/s]

HTTP Error: 422 Client Error:  for url: https://www.yenisafak.com/en/news/uae-contributes-25m-to-un-for-humanitarian-efforts-in-sudan-south-sudan-3686356


urls:  80%|████████  | 8/10 [00:08<00:02,  1.23s/it]

HTTP Error: 403 Client Error: Forbidden. for url: https://www.marketscreener.com/news/latest/Sudan-oil-pipeline-resumption-imminent-says-South-Sudan-official-46842113/


urls: 100%|██████████| 10/10 [00:10<00:00,  1.10s/it]


In [6]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.character import CharacterTextSplitter
import os

## FRAGMENTING DOCUMENTS
def split_documents():
    """Load the most recently created file in ``./data`` and split it into chunks.

    This function only loads and splits; it does not embed the chunks or
    touch a vector store.

    Returns:
        The list of document chunks produced by ``CharacterTextSplitter``.

    Raises:
        FileNotFoundError: If ``./data`` is missing or contains no files.
    """
    data_folder = "./data"
    # Only regular files are candidates; sub-directories cannot be loaded.
    candidates = [
        path
        for path in (os.path.join(data_folder, name) for name in os.listdir(data_folder))
        if os.path.isfile(path)
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No dataset files found in {data_folder!r}; run scrape() first."
        )
    latest_file = max(candidates, key=os.path.getctime)
    # The dataset is written as UTF-8, so read it back the same way rather
    # than relying on the platform's default encoding.
    raw_documents = TextLoader(latest_file, encoding="utf-8").load()
    # NOTE(review): the captured run logs chunks far larger than chunk_size=100,
    # presumably because text is only cut at the splitter's separator — confirm
    # whether a different separator or splitter is wanted.
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    return text_splitter.split_documents(raw_documents)

# Chunk the newest scraped dataset file; `docs` feeds the vector store below.
docs = split_documents()

Created a chunk of size 6513, which is longer than the specified 100
Created a chunk of size 2249, which is longer than the specified 100
Created a chunk of size 8380, which is longer than the specified 100
Created a chunk of size 153, which is longer than the specified 100
Created a chunk of size 1468, which is longer than the specified 100
Created a chunk of size 18271, which is longer than the specified 100
Created a chunk of size 18271, which is longer than the specified 100
Created a chunk of size 5297, which is longer than the specified 100


In [7]:
import os
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
)
from langchain.prompts.chat import (
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma

In [15]:
## VECTOR STORE 
def load_embeddings(documents):
    """Build and return a Chroma vector store over ``documents``.

    The documents are embedded with ``OpenAIEmbeddings``. The store itself is
    returned, not a retriever, so callers can run similarity searches on it.
    """
    embedding_model = OpenAIEmbeddings()
    return Chroma.from_documents(documents, embedding_model)

# Build the vector store from the chunks, then run an ad-hoc similarity
# query as a quick check that retrieval returns documents.
vector_db = load_embeddings(docs)
print(vector_db.similarity_search("islamic state"))


[Document(page_content='Muslim leader calls on Kiir to mediate peace in Sudan - Radio Tamazuj\nRadio Tamazuj\nIndependent News Crossing Borders\nSheikh Abdalah Barac. (File photo)', metadata={'source': './data/dataset_24140820.txt'}), Document(page_content='Muslim leader calls on Kiir to mediate peace in Sudan - Radio Tamazuj\nRadio Tamazuj\nIndependent News Crossing Borders\nSheikh Abdalah Barac. (File photo)', metadata={'source': './data/dataset_24140820.txt'}), Document(page_content='The Secretary-General of the Islamic Council of South Sudan has appealed to the South Sudanese President Salva Kiir to mediate to resolve the dispute between the warring parties in Sudan to bring peace and lasting stability.\nSpeaking to Radio Tamzuj on Sunday in Juba, Sheikh Abdallah Barac, described the president as the right person to arbitrate the conflict in Sudan.\n“I appeal to President Kiir to have an effective role in bringing about peace in the state of Sudan and to bring all parties to the co