In [2]:

import pandas as pd
import requests
import io
from zipfile import ZipFile
from datetime import datetime, timedelta
import os

# Create a folder for storing data
os.makedirs("gdelt_energy_news", exist_ok=True)

# Define range of dates: GDELT daily files (15-minute intervals per day)
start_date = datetime(2015, 1, 1)
end_date = datetime(2019, 12, 31)

# Base URL for GDELT Event database (GKG: Global Knowledge Graph)
base_url = "http://data.gdeltproject.org/gkg/"

# Relevant energy-related themes
energy_keywords = ['ENERGY', 'GAS', 'OIL', 'ELECTRICITY', 'FUEL', 'POWER']

# Function to download and filter a single GKG file
def process_file(file_url):
    try:
        r = requests.get(file_url, timeout=30)
        if r.status_code != 200:
            return None
        z = ZipFile(io.BytesIO(r.content))
        name = z.namelist()[0]
        df = pd.read_csv(z.open(name), sep='\t', header=None, quoting=3, lineterminator='\n', encoding='latin1', 
                         names=['GKGRECORDID', 'DATE', 'SourceCollectionIdentifier', 'SourceCommonName',
                                'DocumentIdentifier', 'Counts', 'V2Counts', 'Themes', 'V2Themes', 'Locations',
                                'V2Locations', 'Persons', 'V2Persons', 'Organizations', 'V2Organizations',
                                'V2Tone', 'Dates', 'GCAM', 'SharingImage', 'RelatedImages', 'SocialImageEmbeds',
                                'SocialVideoEmbeds', 'Quotations', 'AllNames', 'Amounts', 'TranslationInfo',
                                'Extras'])

        # Filter by Germany + Energy Themes
        filtered = df[
            df['V2Themes'].fillna('').str.contains('|'.join(energy_keywords), case=False) &
            df['V2Locations'].fillna('').str.contains('Germany', case=False)
        ][['DATE', 'DocumentIdentifier', 'V2Themes', 'V2Locations', 'V2Tone']]

        return filtered
    except Exception as e:
        print(f"Error: {file_url} — {e}")
        return None

# Loop over daily GKG summary files for 2015–2019
current_date = start_date
all_data = []

while current_date <= end_date:
    date_str = current_date.strftime('%Y%m%d')
    hour_strs = ['000000', '150000']  # Two files per day (GKG summary files)
    for hour in hour_strs:
        file_name = f"{date_str}{hour}.gkg.csv.zip"
        file_url = base_url + file_name
        print(f"Fetching: {file_url}")
        filtered_df = process_file(file_url)
        if filtered_df is not None and not filtered_df.empty:
            all_data.append(filtered_df)
    current_date += timedelta(days=1)

# Combine and save
if all_data:
    combined = pd.concat(all_data)
    combined['DATE'] = pd.to_datetime(combined['DATE'], format='%Y%m%d%H%M%S', errors='coerce')
    combined.to_csv("gdelt_energy_news/energy_news_germany_2015_2019.csv", index=False)
    print("✅ Saved filtered energy news to: energy_news_germany_2015_2019.csv")
else:
    print("⚠️ No relevant articles found.")


Fetching: http://data.gdeltproject.org/gkg/20150101000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150101150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150102000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150102150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150103000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150103150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150104000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150104150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150105000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150105150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150106000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150106150000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150107000000.gkg.csv.zip
Fetching: http://data.gdeltproject.org/gkg/20150107150000.gkg.csv.zip
Fetching: http://dat

Tried collecting news data for the next part of the project but no articles were found.</br>
Finished by Jad Akra on Friday 19th of April 2025