# testing feature extraction
- The goal is to extract features that pertain to the Syrian civil war.
- ideas
  - Drought (2006-2010): it predates the war, but it still shows that climate was an important factor
  - migration to urban areas 
  - economic hardship (prices of goods)
  - unemployment
  - international intervention
  - refugee flows

In [1]:
from rag_app import get_gdelt_data, scrape, create_dataset, process_articles
import requests
from datetime import datetime, timedelta




[92mOPENAI API KEY DETECTED[0m


In [5]:
# Query window: the 30 days that end at the moment this cell runs.
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

# Every search phrase has the form "Syria <topic>".
queries = [
    f"Syria {topic}"
    for topic in ("drought", "climate disaster", "water scarcity", "civil war")
]

# Show the window, run the GDELT lookup (capped at 5 records), then show
# the article URLs that came back.
print(start_date, end_date)
urls, response = get_gdelt_data(queries, start_date, end_date, max_records=5)
print(urls)

2024-05-26 23:53:08.577166 2024-06-25 23:53:08.577166
['Syria drought', 'Syria climate disaster', 'Syria water scarcity', 'Syria civil war']
Request URL: https://api.gdeltproject.org/api/v2/doc/doc?query=(Syria drought OR Syria climate disaster OR Syria water scarcity OR Syria civil war sourcelang:english)&mode=artlist&format=json&startdatetime=20240526235308&enddatetime=20240625235308&maxrecords=5
['https://www.alanba.com.kw/1261781', 'https://www.vetogate.com/5179640', 'http://arabic.news.cn/20240618/bed9300feb0447bcb87f494de4229c11/c.html', 'https://alghad.com/Section-114/%D9%85%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D8%A7%D9%84%D9%8A%D9%88%D9%85/%D9%83%D8%A7%D8%B1%D8%AB%D8%A9-%D8%B7%D9%88%D9%8A%D9%84%D8%A9-%D8%A7%D9%84%D8%A3%D9%85%D8%AF-1725560', 'https://www.ammonnews.net/article/853821']


In [9]:
from dateutil.relativedelta import relativedelta

def generate_monthly_ranges(start_year=2020, *, until=None):
    """Return one ``(first_day, last_day)`` pair per calendar month.

    Months are generated from January of ``start_year`` onwards; a month is
    included only while its first day is strictly earlier than the cutoff.

    Args:
        start_year: Year whose January is the first month produced.
        until: Naive ``datetime`` cutoff. Defaults to 1 December 2020, so a
            call without arguments yields January-November 2020.

    Returns:
        A list of ``(start, end)`` tuples of naive ``datetime`` objects, both
        at 00:00. ``end`` is midnight at the start of the month's last day.
        The list is empty when January of ``start_year`` is not before the
        cutoff.
    """
    cutoff = datetime(2020, 12, 1) if until is None else until

    date_ranges = []
    month_start = datetime(start_year, 1, 1)
    while month_start < cutoff:
        # The first day of the following month, rolling over the year after
        # December.
        if month_start.month == 12:
            next_start = datetime(month_start.year + 1, 1, 1)
        else:
            next_start = datetime(month_start.year, month_start.month + 1, 1)

        # Stepping back one day from the next month's first day gives the
        # last day of this month regardless of its length (28-31 days).
        date_ranges.append((month_start, next_start - timedelta(days=1)))
        month_start = next_start

    return date_ranges

# Quick visual check: list every monthly range generated from January 2011.
print(generate_monthly_ranges(2011))

[(datetime.datetime(2011, 1, 1, 0, 0), datetime.datetime(2011, 1, 31, 0, 0)), (datetime.datetime(2011, 2, 1, 0, 0), datetime.datetime(2011, 2, 28, 0, 0)), (datetime.datetime(2011, 3, 1, 0, 0), datetime.datetime(2011, 3, 31, 0, 0)), (datetime.datetime(2011, 4, 1, 0, 0), datetime.datetime(2011, 4, 30, 0, 0)), (datetime.datetime(2011, 5, 1, 0, 0), datetime.datetime(2011, 5, 31, 0, 0)), (datetime.datetime(2011, 6, 1, 0, 0), datetime.datetime(2011, 6, 30, 0, 0)), (datetime.datetime(2011, 7, 1, 0, 0), datetime.datetime(2011, 7, 31, 0, 0)), (datetime.datetime(2011, 8, 1, 0, 0), datetime.datetime(2011, 8, 31, 0, 0)), (datetime.datetime(2011, 9, 1, 0, 0), datetime.datetime(2011, 9, 30, 0, 0)), (datetime.datetime(2011, 10, 1, 0, 0), datetime.datetime(2011, 10, 31, 0, 0)), (datetime.datetime(2011, 11, 1, 0, 0), datetime.datetime(2011, 11, 30, 0, 0)), (datetime.datetime(2011, 12, 1, 0, 0), datetime.datetime(2011, 12, 31, 0, 0)), (datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 31, 

In [15]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

def analyze_content(db, date_range, max_chars=3500):
    """Ask the LLM to rate drought/climate severity in Syria for one period.

    Args:
        db: Store queried through ``db.similarity_search(query)``; every hit
            must expose its text as ``page_content``.
        date_range: ``(start, end)`` pair. Both values are interpolated into
            the prompt as the period under analysis.
        max_chars: Maximum number of characters of retrieved text placed in
            the prompt. ``None`` disables the cap.

    Returns:
        The value returned by ``LLMChain.run`` for the filled-in prompt; the
        prompt asks for a 0-10 rating followed by a short explanation.
    """
    print("running analyze_content for ", date_range)

    query = "drought OR 'climate disaster' OR 'water scarcity' OR 'extreme weather' OR 'climate change'"
    docs = db.similarity_search(query)

    # Cap the joined text (characters), not the list of hits: slicing the
    # list would leave the prompt unbounded when individual chunks are long.
    content = "\n".join(doc.page_content for doc in docs)[:max_chars]

    template = """
    Analyze the following text for mentions of drought, climate disasters, or related issues in Syria for the period {start_date} to {end_date}. 
    Rate the severity of the situation on a scale from 0 to 10, where 0 is no mention and 10 is extremely severe.
    Provide a brief explanation for your rating.

    Text: {content}

    Rating (0-10):
    Explanation:
    """

    prompt = PromptTemplate(
        input_variables=["start_date", "end_date", "content"],
        template=template
    )

    # temperature=0 keeps the rating as repeatable as the model allows.
    llm = OpenAI(temperature=0)
    chain = LLMChain(llm=llm, prompt=prompt)

    result = chain.run(start_date=date_range[0], end_date=date_range[1], content=content)
    return result

In [16]:
# Trial run of the monthly pipeline: look up article URLs for each month,
# process them into a searchable store, and ask the LLM for a severity rating.
date_ranges = generate_monthly_ranges()
results = []

# The search phrases do not depend on the month, so build them once.
queries = ["Syria drought", "Syria civil war", "Syria climate change"]

# Only the first two months are processed to keep this run short.
for start_date, end_date in date_ranges[:2]:
    print(start_date, end_date)
    urls, _ = get_gdelt_data(queries, start_date, end_date, max_records=10)
    print("urls: ", urls)
    db = process_articles(urls)
    analysis = analyze_content(db, (start_date, end_date))
    results.append({
        "period": start_date.strftime("%Y-%m"),
        "analysis": analysis,
    })

# Save results to a file. The encoding is pinned because the analysis is
# model-generated text and may contain characters that the platform's default
# encoding cannot represent.
with open("syria_climate_analysis.txt", "w", encoding="utf-8") as f:
    for result in results:
        f.write(f"Period: {result['period']}\n")
        f.write(f"Analysis: {result['analysis']}\n\n")

print("Analysis complete. Results saved to syria_climate_analysis.txt")


2020-01-01 00:00:00 2020-01-31 00:00:00
['Syria drought', 'Syria civil war', 'Syria climate change']
Request URL: https://api.gdeltproject.org/api/v2/doc/doc?query=(Syria drought OR Syria civil war OR Syria climate change sourcelang:english)&mode=artlist&format=json&startdatetime=20200101000000&enddatetime=20200131000000&maxrecords=10
urls:  ['https://earther.gizmodo.com/the-depressing-reason-more-drought-could-lead-to-less-c-1840977438', 'https://www.gizmodo.com.au/2020/01/the-depressing-reason-more-drought-could-lead-to-less-conflict/', 'https://www.gizmodo.co.uk/2020/01/the-depressing-reason-more-drought-could-lead-to-less-conflict/', 'https://lenta.ru/news/2020/01/30/foodforsyria/', 'https://military.china.com/retie/37485974.html', 'https://gizmodo.com/?startTime=1578958199999', 'https://gizmodo.com/?startTime=1578958440412', 'https://gizmodo.com/?startTime=1578953400833', 'https://gizmodo.com/?startTime=1579014044520', 'https://gizmodo.com/?startTime=1579010400994']
['https://eart

urls:  10%|█         | 1/10 [00:00<00:02,  3.49it/s]

are you stuck here
or here


urls:  20%|██        | 2/10 [00:00<00:03,  2.25it/s]

are you stuck here
or here


urls:  30%|███       | 3/10 [00:01<00:03,  2.23it/s]

HTTP Error: 404 Client Error: Not Found for url: https://gizmodo.com/2020/01/the-depressing-reason-more-drought-could-lead-to-less-conflict


urls:  40%|████      | 4/10 [00:03<00:05,  1.04it/s]

are you stuck here
or here


urls:  50%|█████     | 5/10 [00:03<00:04,  1.05it/s]

HTTP Error: 404 Client Error: Not Found for url: https://military.china.com/retie/37485974.html
are you stuck here
or here


urls:  60%|██████    | 6/10 [00:04<00:03,  1.31it/s]

are you stuck here
or here


urls:  70%|███████   | 7/10 [00:04<00:01,  1.60it/s]

are you stuck here
or here


urls:  80%|████████  | 8/10 [00:05<00:01,  1.84it/s]

are you stuck here
or here


urls:  90%|█████████ | 9/10 [00:05<00:00,  2.00it/s]

are you stuck here
or here


urls: 100%|██████████| 10/10 [00:05<00:00,  1.71it/s]
Created a chunk of size 4541, which is longer than the specified 1000
Created a chunk of size 4270, which is longer than the specified 1000
Created a chunk of size 1745, which is longer than the specified 1000
Created a chunk of size 1014, which is longer than the specified 1000
Created a chunk of size 1014, which is longer than the specified 1000
Created a chunk of size 1014, which is longer than the specified 1000
Created a chunk of size 1014, which is longer than the specified 1000


running analyze_content for  (datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 1, 31, 0, 0))
2020-02-01 00:00:00 2020-02-29 00:00:00
['Syria drought', 'Syria civil war', 'Syria climate change']
Request URL: https://api.gdeltproject.org/api/v2/doc/doc?query=(Syria drought OR Syria civil war OR Syria climate change sourcelang:english)&mode=artlist&format=json&startdatetime=20200201000000&enddatetime=20200229000000&maxrecords=10
urls:  ['http://www.hnn.us/article/174191', 'http://historynewsnetwork.org/article/174191', 'http://hnn.us/article/174191', 'https://historynewsnetwork.org/article/174191', 'http://alrai.com/article/10523535', 'https://www.hpr2.org/post/no-security-solutions-world-ignores-climate-change', 'https://www.hawaiipublicradio.org/post/no-security-solutions-world-ignores-climate-change', 'https://www.delfi.lv/news/arzemes/francija-aizturets-sirijas-islamists-kuru-apsudz-kara-noziegumos.d?id=51847475', 'http://www.sohu.com/a/371089155_120270994', 'https://www.e

urls:  10%|█         | 1/10 [00:00<00:05,  1.75it/s]

are you stuck here
or here


urls:  20%|██        | 2/10 [00:00<00:03,  2.33it/s]

are you stuck here
or here


urls:  30%|███       | 3/10 [00:01<00:02,  2.63it/s]

are you stuck here
or here


urls:  40%|████      | 4/10 [00:01<00:02,  2.36it/s]

are you stuck here
or here
HTTP Error: 403 Client Error: Forbidden for url: http://alrai.com/article/10523535


urls:  60%|██████    | 6/10 [00:02<00:01,  2.90it/s]

Error Connecting: HTTPSConnectionPool(host='www.hpr2.org', port=443): Max retries exceeded with url: /post/no-security-solutions-world-ignores-climate-change (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'www.hpr2.org'. (_ssl.c:1000)")))


urls:  70%|███████   | 7/10 [00:03<00:01,  2.10it/s]

are you stuck here
or here


urls:  80%|████████  | 8/10 [00:04<00:01,  1.32it/s]

are you stuck here
or here


urls:  90%|█████████ | 9/10 [00:05<00:00,  1.12it/s]

are you stuck here
or here


urls: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]
Created a chunk of size 8867, which is longer than the specified 1000
Created a chunk of size 8867, which is longer than the specified 1000
Created a chunk of size 8867, which is longer than the specified 1000
Created a chunk of size 8867, which is longer than the specified 1000
Created a chunk of size 4505, which is longer than the specified 1000
Created a chunk of size 1803, which is longer than the specified 1000
Created a chunk of size 10010, which is longer than the specified 1000


are you stuck here
or here
running analyze_content for  (datetime.datetime(2020, 2, 1, 0, 0), datetime.datetime(2020, 2, 29, 0, 0))
Analysis complete. Results saved to syria_climate_analysis.txt
