In [1]:
## IMPORTS
import streamlit as st
import requests
from bs4 import BeautifulSoup
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pandas as pd
from langchain_community.llms import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.character import CharacterTextSplitter
import os
import json

from llama_index.core import ServiceContext, PromptHelper, VectorStoreIndex, SimpleDirectoryReader, set_global_service_context 
from llama_index.llms.openai import OpenAI
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.core.node_parser import SimpleNodeParser

from tqdm import tqdm
from trafilatura.sitemaps import sitemap_search
from trafilatura import extract_metadata

## API KEYS
import openai
# NOTE: this import rebinds the bare name ``OpenAI`` (previously imported from
# langchain_community.llms and llama_index.llms.openai above) to the raw
# OpenAI client class. Code that needs the LangChain LLM should import it
# under an explicit alias.
from openai import OpenAI

# The organization can be overridden through the environment; the previous
# hard-coded value is kept as the fallback so existing setups keep working.
openai.organization = os.getenv("OPENAI_ORG_ID", "org-raWgaVqCbuR9YlP1CIjclYHk") # Harvard
# The API key is only ever read from the environment, never stored in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")
print("\033[92mOPENAI API KEY DETECTED\033[0m" if openai.api_key else "\033[91mNO API KEY DETECTED\033[0m")

# Data collection helpers: GDELT article search (get_gdelt_data) and page scraping (create_dataset, scrape)

def get_gdelt_data(query, start_date, end_date, max_records=5, timeout=10):
    """Search the GDELT doc API for English-language articles matching ``query``.

    Args:
        query: Search expression; ``sourcelang:english`` is appended to it.
        start_date: ``datetime`` marking the start of the search window.
        end_date: ``datetime`` marking the end of the search window.
        max_records: Maximum number of articles to request. Defaults to 5,
            the value that used to be hard-coded.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A ``(urls, response)`` tuple with the article URLs and the decoded
        JSON payload. ``([], {})`` is returned when the body is not a JSON
        object (for example when the endpoint answers with an error message).

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    params = {
        "query": f"{query} sourcelang:english",
        "mode": "artlist",
        "format": "json",
        "startdatetime": start_date.strftime("%Y%m%d%H%M%S"),
        "enddatetime": end_date.strftime("%Y%m%d%H%M%S"),
        "maxrecords": max_records,
    }
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    http_response = requests.get(base_url, params=params, timeout=timeout)
    http_response.raise_for_status()

    try:
        payload = http_response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print(f"GDELT returned a non-JSON response: {http_response.text[:200]}")
        return [], {}

    if not isinstance(payload, dict):
        return [], {}

    # Skip entries without a URL instead of failing the whole search.
    urls = [article["url"] for article in payload.get("articles", []) if article.get("url")]
    return urls, payload

def create_dataset(list_of_websites: list):
    """Download each URL and extract its title, description and paragraph text.

    URLs that cannot be fetched (HTTP error, connection error, timeout or any
    other ``requests`` failure) are reported on stdout and skipped.

    Args:
        list_of_websites: URLs to scrape.

    Returns:
        A list with one dict per successfully fetched page, with the keys
        ``url``, ``title``, ``body`` and ``description``. ``title`` is an empty
        string when the page has no usable title tag; ``description`` is
        ``None`` when no metadata could be extracted.
    """
    data = []
    for url in tqdm(list_of_websites, desc="urls"):
        try:
            # Send the HTTP request with a timeout of 5 seconds
            response = requests.get(url, timeout=5)
            response.raise_for_status()  # Check for successful response
        except requests.exceptions.HTTPError as errh:
            print(f"HTTP Error: {errh}")
            continue
        except requests.exceptions.ConnectionError as errc:
            print(f"Error Connecting: {errc}")
            continue
        except requests.exceptions.Timeout as errt:
            print(f"Timeout Error: {errt}")
            continue
        except requests.RequestException as err:
            print(f"Error during requests to {url}: {str(err)}")
            continue

        # Parse HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        metadata = extract_metadata(response.content)

        # Pages without a title tag (or with a title that has no single string)
        # used to raise an uncaught AttributeError / store None here.
        title_tag = soup.title
        title = (title_tag.string if title_tag is not None else None) or ""
        # extract_metadata can return None when nothing could be extracted.
        description = metadata.description if metadata is not None else None

        # Extract text from each paragraph
        paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
        data.append(
            {
                "url": url,
                "title": title,
                "body": "\n".join(paragraphs),
                "description": description,
            }
        )
    return data

def scrape(list_of_websites: list) -> None:
    """Scrape the given URLs and write their text to a new file under ``./data``.

    The pages are fetched with ``create_dataset``. For every page the title
    and then the body are written, separated by blank lines, into
    ``./data/dataset_<timestamp>.txt`` (UTF-8).

    Args:
        list_of_websites: URLs to scrape.
    """
    data = create_dataset(list_of_websites)

    # Full date in the stamp: the former day-of-month stamp repeated every month.
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    dataset_filename = f"./data/dataset_{current_time}.txt"
    # Create the output folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(dataset_filename), exist_ok=True)

    with open(dataset_filename, "w", encoding="utf-8") as file:
        for article in data:
            # A page may come back without a title/body; write an empty string
            # rather than raising TypeError on str + None.
            title = article.get("title") or ""
            body = article.get("body") or ""
            file.write("\n" + title + "\n")
            file.write(body + "\n\n")


## FRAGMENTING DOCUMENTS
def split_documents(data_folder="./data", chunk_size=1000, chunk_overlap=0):
    """Load the most recently created file in ``data_folder`` and split it into chunks.

    Args:
        data_folder: Folder that holds the scraped dataset files.
        chunk_size: Chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Chunk overlap passed to ``CharacterTextSplitter``.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_documents``.

    Raises:
        FileNotFoundError: If ``data_folder`` contains no regular, non-hidden file.
    """
    candidates = (os.path.join(data_folder, name) for name in os.listdir(data_folder))
    # Only regular, non-hidden files qualify: a sub-directory or a hidden
    # housekeeping file could otherwise be picked as the "latest" entry.
    dataset_files = [
        path
        for path in candidates
        if os.path.isfile(path) and not os.path.basename(path).startswith(".")
    ]
    if not dataset_files:
        raise FileNotFoundError(f"No dataset files found in {data_folder!r}")

    latest_file = max(dataset_files, key=os.path.getctime)
    # The dataset files are written as UTF-8, so read them back the same way
    # rather than relying on the platform default encoding.
    raw_documents = TextLoader(latest_file, encoding="utf-8").load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(raw_documents)


def process_articles(urls):
    """Scrape ``urls`` and index the resulting text chunks in a Chroma vector store.

    Steps, in order: ``scrape`` writes a new dataset file under ./data,
    ``split_documents`` chunks the newest file in that folder, and the chunks
    are embedded with ``OpenAIEmbeddings`` and loaded into Chroma.

    Returns:
        The vector store built by ``Chroma.from_documents``.
    """
    scrape(urls)
    chunks = split_documents()
    return Chroma.from_documents(chunks, OpenAIEmbeddings())

[92mOPENAI API KEY DETECTED[0m


In [2]:
def query_llm(query, context):
    """Answer ``query`` with an OpenAI completion model, grounded in ``context``.

    Args:
        query: The user's question; fills the ``human_input`` prompt slot.
        context: Background text the model is told to base its answer on.

    Returns:
        The generated answer as a string (the chain's ``text`` output).
    """
    # Imported locally under an alias: this file binds the bare name ``OpenAI``
    # to several different classes (LangChain, LlamaIndex, the openai client),
    # so the global name depends on which cell ran last. LLMChain needs the
    # LangChain LLM.
    from langchain_community.llms import OpenAI as LangChainOpenAI

    llm = LangChainOpenAI()
    template = """
    You are an AI assistant specializing in armed conflicts and international relations.
    Use the following context to answer the question. If you can't answer based on the context, say "I don't have enough information to answer that."

    Context: {context}

    Human: {human_input}
    AI Assistant: """

    prompt = PromptTemplate(template=template, input_variables=["context", "human_input"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    # invoke() takes the inputs as a single dict (keyword-only inputs raise a
    # TypeError) and returns a dict whose "text" entry is the generated answer.
    result = llm_chain.invoke({"context": context, "human_input": query})
    return result["text"]

In [None]:
def create_timeline(events, vectorstore):
    """Ask the LLM for a JSON timeline built from documents in ``vectorstore``.

    Args:
        events: Currently unused; kept so existing callers keep working.
        vectorstore: Object exposing ``similarity_search(query)`` that returns
            documents with a ``page_content`` attribute.

    Returns:
        The parsed timeline as a list (the prompt asks for objects with a
        ``date`` and a ``description`` field). An empty list is returned, and
        an error is shown through ``st.error``, when the answer is not a
        JSON array.
    """
    query = "Create a timeline of important events based on the given information. Format the response as a JSON array of objects, where each object has a 'date' field in 'YYYY-MM-DD' format and a 'description' field."
    docs = vectorstore.similarity_search(query)
    context = "\n".join(doc.page_content for doc in docs)
    response = query_llm(query, context)
    # A chain's invoke() result is a dict with the answer under "text";
    # accept that shape as well as a plain string.
    if isinstance(response, dict):
        response = response.get("text", "")

    try:
        timeline_data = json.loads(response)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a response that is neither str nor bytes.
        st.error("Failed to parse timeline data. Please try again.")
        return []

    if not isinstance(timeline_data, list):
        # Valid JSON, but not the array the prompt asked for.
        st.error("Failed to parse timeline data. Please try again.")
        return []
    return timeline_data


# Timeline generation and visualization (the cells below)

In [27]:
from langchain_community.llms import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
import plotly.express as px
import pandas as pd
from datetime import datetime
import os

# Output parsers for the two LLM calls below. Each expects a single top-level
# key in the model's JSON answer. The schema type is stated explicitly because
# ResponseSchema defaults to "string", while the prompts ask for (and the
# calling code reads) a list of objects under each key.
conflict_schema = ResponseSchema(name="conflicts", description="List of conflicts", type="array of objects")
conflict_parser = StructuredOutputParser.from_response_schemas([conflict_schema])

event_schema = ResponseSchema(name="events", description="List of timeline events", type="array of objects")
event_parser = StructuredOutputParser.from_response_schemas([event_schema])


def query_llm_for_conflicts(country, temperature=0.7):
    """Ask the LLM for the major armed conflicts that occurred in ``country``.

    Args:
        country: Country name inserted into the prompt.
        temperature: Sampling temperature for the model. Defaults to 0.7, the
            value that used to be hard-coded.

    Returns:
        Whatever ``conflict_parser.parse`` produces from the model's answer;
        callers read the conflict list from its ``"conflicts"`` key.
    """
    # Imported locally under an alias: the bare name ``OpenAI`` is bound to
    # different classes by different cells of this notebook, so which one is
    # global depends on execution order. LLMChain needs the LangChain LLM.
    from langchain_community.llms import OpenAI as LangChainOpenAI

    llm = LangChainOpenAI(temperature=temperature)
    template = """
    You are an AI assistant specializing in armed conflicts and international relations.
    Please provide a list of major armed conflicts that have occurred in {country}.
    Format your response as a JSON array of objects, where each object has the following fields:
    - name: the name of the conflict
    - start_year: the year the conflict started (integer)
    - end_year: the year the conflict ended, or "ongoing" if it's still active (integer or string)

    {format_instructions}

    Human: List conflicts in {country}
    AI Assistant:"""

    prompt = PromptTemplate(
        template=template,
        input_variables=["country"],
        partial_variables={"format_instructions": conflict_parser.get_format_instructions()}
    )
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    # invoke() replaces the deprecated run(); the generated text is under "text".
    response = llm_chain.invoke({"country": country})["text"]
    return conflict_parser.parse(response)

In [28]:
def query_llm_for_timeline(conflict_name, start_year, end_year, temperature=0.7):
    """Ask the LLM for a timeline of important events of one conflict.

    Args:
        conflict_name: Name of the conflict, inserted into the prompt.
        start_year: Year the conflict started, inserted into the prompt.
        end_year: Year the conflict ended (or a label such as "Present"),
            inserted into the prompt.
        temperature: Sampling temperature for the model. Defaults to 0.7, the
            value that used to be hard-coded.

    Returns:
        Whatever ``event_parser.parse`` produces from the model's answer;
        callers read the event list from its ``"events"`` key.
    """
    # Imported locally under an alias: the bare name ``OpenAI`` is bound to
    # different classes by different cells of this notebook, so which one is
    # global depends on execution order. LLMChain needs the LangChain LLM.
    from langchain_community.llms import OpenAI as LangChainOpenAI

    llm = LangChainOpenAI(temperature=temperature)
    template = """
    You are an AI assistant specializing in armed conflicts and international relations.
    Please provide a timeline of important events for the {conflict_name}, which occurred from {start_year} to {end_year}.
    Format your response as a JSON array of objects, where each object has the following fields:
    - date: the date of the event in "YYYY-MM-DD" format (use an approximate date if the exact date is unknown)
    - description: a brief description of the event

    {format_instructions}

    Human: Create a timeline for {conflict_name}
    AI Assistant:"""

    prompt = PromptTemplate(
        template=template,
        input_variables=["conflict_name", "start_year", "end_year"],
        partial_variables={"format_instructions": event_parser.get_format_instructions()}
    )
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    # invoke() replaces the deprecated run(); the generated text is under "text".
    response = llm_chain.invoke(
        {"conflict_name": conflict_name, "start_year": start_year, "end_year": end_year}
    )["text"]
    return event_parser.parse(response)

In [29]:
def visualize_timeline(timeline_data):
    """Plot timeline events as dated markers, earliest event at the top.

    Args:
        timeline_data: Iterable of dicts with a ``date`` (parseable by
            ``pd.to_datetime``) and a ``description``. Entries whose date
            cannot be parsed are dropped. May be empty.

    Returns:
        The Plotly figure.
    """
    # Fixing the columns up front keeps an empty input from raising KeyError.
    df = pd.DataFrame(timeline_data, columns=["date", "description"])
    # LLM-generated dates are not guaranteed to be valid; coerce and drop them
    # instead of aborting the whole plot.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.dropna(subset=["date"]).sort_values("date")

    # Events are single points in time. px.timeline draws bars from x_start to
    # x_end, so identical start/end values produce zero-width (invisible) bars;
    # a scatter plot shows one marker per event instead.
    fig = px.scatter(
        df,
        x="date",
        y="description",
        title="Timeline of Important Events"
    )

    # Reverse the y axis so the chronologically first event is on top.
    fig.update_yaxes(autorange="reversed")
    fig.update_layout(
        height=600,
        title_x=0.5,
        xaxis_title="",
        yaxis_title="",
        font=dict(size=12)
    )

    return fig


In [30]:
# Ask the LLM for the conflicts of one country and keep the list stored under
# the "conflicts" key of the parsed answer (an empty list when the key is
# missing). `conflicts` is read by the timeline cell that follows.
country = "sudan"
conflicts_data = query_llm_for_conflicts(country)
conflicts = conflicts_data.get("conflicts", [])
print(conflicts)

[{'name': 'Second Sudanese Civil War', 'start_year': 1983, 'end_year': 2005}, {'name': 'Darfur Conflict', 'start_year': 2003, 'end_year': 'ongoing'}]


In [31]:
# Generate a timeline for the first conflict found above.
# The commented-out Streamlit draft that used to live here looped over every
# conflict and showed one button per conflict; this notebook cell only
# exercises the first one and prints the result.

# Default to an empty list so later cells that read `timeline_events` do not
# fail with a NameError when no conflict (or no timeline) was returned.
timeline_events = []

if conflicts:
    conflict = conflicts[0]
    # The prompt reads better with "Present" than with the raw "ongoing" marker.
    end_year = conflict['end_year'] if conflict['end_year'] != "ongoing" else "Present"
    timeline_data = query_llm_for_timeline(conflict['name'], conflict['start_year'], end_year)
    # `or []` also covers an "events" key that is present but empty/None.
    timeline_events = timeline_data.get("events", []) or []

    if timeline_events:
        print("timeline events: ", timeline_events)
    else:
        # print() is used for every branch: st.write has no visible effect
        # when the cell runs inside a notebook.
        print("No timeline data available.")
else:
    print("No conflicts found for the specified country.")

timeline events:  [{'date': '1983-03-18', 'description': "Sudan People's Liberation Army/Movement (SPLA/M) is formed, led by John Garang"}, {'date': '1983-05-16', 'description': 'The SPLA/M launches its first military action against the Sudanese government'}, {'date': '1989-06-30', 'description': 'Omar al-Bashir takes power in a military coup, beginning his 30-year rule'}, {'date': '1991-08-28', 'description': 'SPLA/M splits into two factions, led by John Garang and Riek Machar'}, {'date': '1995-04-21', 'description': 'SPLA/M signs a peace agreement with the government, but it falls apart within months'}, {'date': '1998-07-30', 'description': 'SPLA/M launches a major offensive'}]


In [32]:
# Keep only the events that carry both a non-empty date and a non-empty
# description, then display the filtered list as the cell output.
timeline_events = [
    event
    for event in timeline_events
    if all(event.get(field) for field in ('date', 'description'))
]
timeline_events

[{'date': '1983-03-18',
  'description': "Sudan People's Liberation Army/Movement (SPLA/M) is formed, led by John Garang"},
 {'date': '1983-05-16',
  'description': 'The SPLA/M launches its first military action against the Sudanese government'},
 {'date': '1989-06-30',
  'description': 'Omar al-Bashir takes power in a military coup, beginning his 30-year rule'},
 {'date': '1991-08-28',
  'description': 'SPLA/M splits into two factions, led by John Garang and Riek Machar'},
 {'date': '1995-04-21',
  'description': 'SPLA/M signs a peace agreement with the government, but it falls apart within months'},
 {'date': '1998-07-30', 'description': 'SPLA/M launches a major offensive'}]

In [35]:
import streamlit as st
from streamlit_timeline import st_timeline

st.set_page_config(layout="wide")


def build_timeline_items(events):
    """Convert event dicts into the item dicts passed to ``st_timeline``.

    Args:
        events: Iterable of dicts with a ``date`` and a ``description``.
            Events missing either value are skipped (same rule as the
            filtering cell above) instead of raising KeyError.

    Returns:
        A list of ``{"id", "content", "start"}`` dicts; ``id`` counts the kept
        events from 1, ``content`` is the description and ``start`` the date.
    """
    complete_events = (
        event for event in events if event.get("date") and event.get("description")
    )
    return [
        {"id": idx, "content": event["description"], "start": event["date"]}
        for idx, event in enumerate(complete_events, start=1)
    ]


# Fixed sample (copied from an earlier LLM answer) so this cell renders the
# same timeline without calling the model again.
events = [{'date': '1983-03-18',
  'description': "Sudan People's Liberation Army/Movement (SPLA/M) is formed, led by John Garang"},
 {'date': '1983-05-16',
  'description': 'The SPLA/M launches its first military action against the Sudanese government'},
 {'date': '1989-06-30',
  'description': 'Omar al-Bashir takes power in a military coup, beginning his 30-year rule'},
 {'date': '1991-08-28',
  'description': 'SPLA/M splits into two factions, led by John Garang and Riek Machar'},
 {'date': '1995-04-21',
  'description': 'SPLA/M signs a peace agreement with the government, but it falls apart within months'},
 {'date': '1998-07-30', 'description': 'SPLA/M launches a major offensive'}]

items = build_timeline_items(events)
items




[{'id': 1,
  'content': "Sudan People's Liberation Army/Movement (SPLA/M) is formed, led by John Garang",
  'start': '1983-03-18'},
 {'id': 2,
  'content': 'The SPLA/M launches its first military action against the Sudanese government',
  'start': '1983-05-16'},
 {'id': 3,
  'content': 'Omar al-Bashir takes power in a military coup, beginning his 30-year rule',
  'start': '1989-06-30'},
 {'id': 4,
  'content': 'SPLA/M splits into two factions, led by John Garang and Riek Machar',
  'start': '1991-08-28'},
 {'id': 5,
  'content': 'SPLA/M signs a peace agreement with the government, but it falls apart within months',
  'start': '1995-04-21'},
 {'id': 6,
  'content': 'SPLA/M launches a major offensive',
  'start': '1998-07-30'}]

In [None]:

# Render `items` with the streamlit_timeline component (no groups, default
# options, 300px tall) and write the component's return value under a
# "Selected item" heading.
# NOTE(review): the return value is presumably the item the user clicked,
# judging by the heading; confirm against the streamlit_timeline docs.
timeline = st_timeline(items, groups=[], options={}, height="300px")
st.subheader("Selected item")
st.write(timeline)

In [33]:
# Build the Plotly figure for `timeline_events`; as the last expression of the
# cell, the returned figure is what the notebook displays.
visualize_timeline(timeline_events)