# Step 0: Set up directory

In [None]:
# ------------------------------------------------------------
# Download ~10GB of RAW GDELT GKG files from MAY 2018
# Store masterfile + data in separate folders for GitHub structure
# ------------------------------------------------------------

import os
import requests
from tqdm import tqdm

# ------------------------------------------------------------
# Local data root (Windows); all other paths derive from it
# ------------------------------------------------------------
BASE_DIR = r"C:\Users\jonat\Documents\UMBC\FALL_2025\Final_Project_UMBC_BIG_DATA\DATA"

# Remote location of the GDELT v2 master file list
MASTERLIST_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Approximate amount of raw data to download, in GB
TARGET_GB = 10

# master/ holds the file list, raw/ holds the downloaded archives
MASTER_DIR, RAW_DIR = (os.path.join(BASE_DIR, sub) for sub in ("master", "raw"))
MASTERFILE = os.path.join(MASTER_DIR, "masterfilelist.txt")

# Create both folders up front; exist_ok makes re-running the cell harmless
for _folder in (MASTER_DIR, RAW_DIR):
    os.makedirs(_folder, exist_ok=True)


# Step 1: Download masterfilelist.txt → stored in DATA/master

In [3]:
def download_masterfilelist():
    """Fetch the GDELT master file list unless a local copy already exists.

    Returns:
        The path stored in MASTERFILE, whether the file was just downloaded
        or was already present.

    Raises:
        requests.HTTPError: if the server answers with an error status.
            Nothing is written to MASTERFILE in that case, so a later run
            will retry instead of reusing an error page as the file list.
    """
    if os.path.exists(MASTERFILE):
        print("masterfilelist.txt already exists.")
        return MASTERFILE

    print("Downloading masterfilelist.txt …")
    r = requests.get(MASTERLIST_URL, timeout=60)
    # Without this check a 4xx/5xx body would be saved and then treated as
    # a valid cached list on every subsequent run.
    r.raise_for_status()

    # Write under a temporary name and rename once complete, so an
    # interrupted write never leaves a truncated file at MASTERFILE.
    tmp_path = MASTERFILE + ".part"
    with open(tmp_path, "wb") as f:
        f.write(r.content)
    os.replace(tmp_path, MASTERFILE)

    return MASTERFILE


# Fetch (or reuse) the master file list and keep its local path for later cells.
master_path = download_masterfilelist()
# Bare expression: shows the path as this notebook cell's output.
master_path


Downloading masterfilelist.txt …


'C:\\Users\\jonat\\Documents\\UMBC\\FALL_2025\\Final_Project_UMBC_BIG_DATA\\DATA\\master\\masterfilelist.txt'

# Step 2: Collect all May 2018 GKG URLs

In [4]:
def get_may_2018_gkg_urls(path, prefix="201805"):
    """Collect GKG archive URLs from a master file list for one time window.

    Each line of the list is split on whitespace and its last field is
    taken as the URL. Only lines mentioning "gkg.csv.zip" are considered,
    and only those whose file name starts with ``prefix`` are kept.

    Args:
        path: Path to the downloaded master file list (text file).
        prefix: Leading characters the file name must start with. Defaults
            to "201805" (May 2018), which preserves the original behavior;
            pass e.g. "20180501" for a single day or "2018" for a year.

    Returns:
        A list of ``(file_name, url)`` tuples in the order they appear in
        the file. Empty if nothing matches.
    """
    urls = []

    # errors="ignore" keeps a stray undecodable byte from aborting the scan.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if "gkg.csv.zip" not in line:
                continue

            # The substring check above guarantees at least one field here.
            url = line.split()[-1]
            fname = url.rsplit("/", 1)[-1]

            if fname.startswith(prefix):
                urls.append((fname, url))

    return urls


# Parse the master list into (file_name, url) pairs for the May 2018 GKG files.
may_urls = get_may_2018_gkg_urls(master_path)

# Report how many candidate files were found before downloading anything.
print(f"Found {len(may_urls)} GKG files for May 2018.")


Found 2853 GKG files for May 2018.


# Step 3: Download raw files until ~10GB, saved in DATA/raw

In [None]:
def download_until_10gb(url_list):
    """Download files into RAW_DIR until roughly TARGET_GB of data is on disk.

    Files already present in RAW_DIR are not re-downloaded; their size is
    counted towards the total so the function can resume an earlier run.
    The size check happens before each file, so the final total may exceed
    the target by up to one file.

    Each download is streamed to a temporary ``<name>.part`` file and only
    renamed to its final name once fully written. A failed or interrupted
    transfer therefore never leaves a truncated archive that a later run
    would mistake for a finished download.

    Args:
        url_list: Iterable of ``(file_name, url)`` tuples.

    Returns:
        None. Progress and the final total are printed.
    """
    total_bytes = 0
    target_bytes = TARGET_GB * 1024 ** 3

    for fname, url in tqdm(url_list):
        if total_bytes >= target_bytes:
            print(f"\nReached ~{TARGET_GB}GB. Stopping downloads.")
            break

        out_path = os.path.join(RAW_DIR, fname)

        # Skip existing (resume support)
        if os.path.exists(out_path):
            total_bytes += os.path.getsize(out_path)
            continue

        part_path = out_path + ".part"
        try:
            # The context manager releases the streamed connection on every
            # exit path, including the non-200 `continue` below.
            with requests.get(url, stream=True, timeout=60) as r:
                if r.status_code != 200:
                    print("Failed:", url)
                    continue

                with open(part_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)

            # Publish the file under its real name only once it is complete.
            os.replace(part_path, out_path)

            file_size = os.path.getsize(out_path)
            total_bytes += file_size

            print(f"Downloaded {fname} ({file_size/1024/1024:.2f} MB)")
            print(f"TOTAL: {total_bytes/1024/1024/1024:.2f} GB")

        except Exception as e:
            # Best-effort: one bad file must not abort the whole batch.
            print("ERROR:", e)
            continue

        finally:
            # Remove any leftover partial download (no-op after a success).
            if os.path.exists(part_path):
                try:
                    os.remove(part_path)
                except OSError:
                    pass

    print("\nFINAL TOTAL:", total_bytes/1024/1024/1024, "GB")


# Start (or resume) downloading the May 2018 GKG files into RAW_DIR.
download_until_10gb(may_urls)
