### Data Collection of Natural Disasters in Brazil Across 10 years (if possible with Reddit)

In [27]:
import praw
import pandas as pd
from datetime import datetime
import time
import os
from secretcodes import reddit_secret, reddit_client, open_api_key

# configure your API credentials
reddit = praw.Reddit(
    client_id=reddit_client,
    client_secret=reddit_secret,
    user_agent="brazil_disasters_scraper by u/haleyhernan"
)


In [20]:
english_terms = [
    "natural disaster", "flood", "landslide", "brazil", "mudslide", "rockslide",
    "tragedy", "emergency", "heavy rain", "storm", "road blocked", "evacuate",
    "catastrophe", "hazard", "crisis", "search and rescue", "disaster zone",
    "road closure", "highway closed", "emergency response", "disaster relief",
    "intense", "torrential", "severe", "extreme", "floodwaters", "flash flood",
    "collapse", "destruction", "damage", "injured", "dead", "casualties",
    "missing", "fatalities", "mountain", "hillside", "slum", "earth movement",
    "soil erosion", "slope"
]

portuguese_terms = [
    "desastre natural", "enchente", "deslizamento de terra", "brasil", "deslizamento de lama",
    "deslizamento de rochas", "tragédia", "emergência", "chuva forte", "tempestade",
    "estrada bloqueada", "evacuar", "catástrofe", "perigo", "crise", "busca e resgate",
    "zona de desastre", "fechamento de estrada", "rodovia fechada", "resposta de emergência",
    "ajuda humanitária em casos de desastres", "intenso", "torrencial", "forte", "extremo",
    "águas da enchente", "inundação repentina", "colapso", "destruição", "dano", "ferido",
    "morto", "baixas", "ausente", "fatalidades", "montanha", "encosta", "favela",
    "movimento da terra", "erosão do solo", "declive"
]

spanish_terms = [
    "desastre natural", "inundación", "corrimiento de tierras", "brasil", "avalancha de lodo",
    "deslizamiento de rocas", "tragedia", "emergencia", "lluvia pesada", "tormenta",
    "carretera bloqueada", "evacuar", "catástrofe", "peligro", "crisis", "búsqueda y rescate",
    "zona de desastre", "cierre de carretera", "autopista cerrada", "respuesta de emergencia",
    "socorro en casos de desastre", "intenso", "torrencial", "severo", "extremo",
    "aguas de la inundación", "inundación repentina", "colapso", "destrucción", "daño",
    "herido", "muerto", "bajas", "desaparecido", "muertes", "montaña", "ladera", "barrio bajo",
    "movimiento de la tierra", "erosión del suelo", "pendiente"
]


In [21]:
def chunk_query_terms(terms, chunk_size=10):
    """Yield chunks of terms to keep queries short and API-safe."""
    for i in range(0, len(terms), chunk_size):
        yield terms[i:i + chunk_size]

def build_query(base, terms):
    """Builds a Reddit search query like (brazil) AND (term1 OR term2 ...)"""
    combined = " OR ".join(f'"{t}"' for t in terms)
    return f"({base}) AND ({combined})"

# Combine all three language term lists
all_terms = english_terms + portuguese_terms + spanish_terms

# Iterate through chunks to generate multiple safe queries
queries = [build_query("Brazil", chunk) for chunk in chunk_query_terms(all_terms, chunk_size=10)]

# Print a few to preview
for q in queries[:3]:
    print(q)
    print("---")


(Brazil) AND ("natural disaster" OR "flood" OR "landslide" OR "brazil" OR "mudslide" OR "rockslide" OR "tragedy" OR "emergency" OR "heavy rain" OR "storm")
---
(Brazil) AND ("road blocked" OR "evacuate" OR "catastrophe" OR "hazard" OR "crisis" OR "search and rescue" OR "disaster zone" OR "road closure" OR "highway closed" OR "emergency response")
---
(Brazil) AND ("disaster relief" OR "intense" OR "torrential" OR "severe" OR "extreme" OR "floodwaters" OR "flash flood" OR "collapse" OR "destruction" OR "damage")
---


In [22]:
DATA_FILE = "brazil_landslides_reddit.csv"
SAVE_EVERY = 5   # how many batches before saving
BATCH_SIZE = 100 # posts per subreddit per window
SLEEP_BETWEEN_CALLS = 2  # seconds between API calls

QUERY = "(Brazil AND ((landslide OR mudslide)) OR (deslizamento AND terra))"
SUBREDDITS = ["worldnews", "news", "brazil", "environment", "earthscience"]


In [23]:
seen = set()
if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
    try:
        existing = pd.read_csv(DATA_FILE)
        seen.update(existing["id"].astype(str))
        print(f"🔁 Loaded {len(seen)} existing posts from {DATA_FILE}")
    except pd.errors.EmptyDataError:
        print("⚠️ Empty file found, starting fresh.")
else:
    print("📁 No existing data found — starting fresh.")


🔁 Loaded 44 existing posts from brazil_landslides_reddit.csv


In [24]:
def collect_reddit_batch(start_date, end_date, query, subreddits):
    posts = []

    for sub in subreddits:
        print(f"🔎 Searching r/{sub}...")
        try:
            for post in reddit.subreddit(sub).search(query, sort="new", limit=BATCH_SIZE):
                post_date = datetime.utcfromtimestamp(post.created_utc)
                if start_date <= post_date <= end_date:
                    if post.id not in seen:
                        seen.add(post.id)
                        posts.append({
                            "id": post.id,
                            "title": post.title,
                            "subreddit": sub,
                            "url": post.url,
                            "score": post.score,
                            "num_comments": post.num_comments,
                            "created_utc": post.created_utc,
                            "created_date": post_date.strftime("%Y-%m-%d")
                        })
        except Exception as e:
            print(f"⚠️ Error searching r/{sub}: {e}")
        time.sleep(SLEEP_BETWEEN_CALLS)

    print(f"✅ Collected {len(posts)} posts in this batch")
    return pd.DataFrame(posts)


In [25]:
def batch_collect_reddit(start_year=2015, end_year=2025):
    all_batches = []
    batch_count = 0

    # Build all multilingual query chunks
    all_terms = english_terms + portuguese_terms + spanish_terms
    queries = [build_query("Brazil", chunk) for chunk in chunk_query_terms(all_terms, chunk_size=10)]

    for year in range(start_year, end_year + 1):
        for month in range(1, 13, 3):  # 3-month windows
            start_date = datetime(year, month, 1)
            end_month = min(month + 2, 12)
            last_day = 30 if end_month != 2 else 28
            end_date = datetime(year, end_month, last_day, 23, 59, 59)

            print(f"\n📆 Collecting {start_date.strftime('%b %Y')} – {end_date.strftime('%b %Y')}")

            # loop through every multilingual query chunk
            for query in queries:
                print(f"🔍 Running query: {query[:80]}...")
                df_batch = collect_reddit_batch(start_date, end_date, query, SUBREDDITS)

                if not df_batch.empty:
                    all_batches.append(df_batch)
                    batch_count += 1

                # periodic save
                if batch_count % SAVE_EVERY == 0 and all_batches:
                    df_all = pd.concat(all_batches, ignore_index=True)
                    if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
                        old = pd.read_csv(DATA_FILE)
                        df_all = pd.concat([old, df_all], ignore_index=True).drop_duplicates(subset=["id"])
                    df_all.to_csv(DATA_FILE, index=False)
                    print(f"💾 Saved {len(df_all)} total posts so far")
                    all_batches = []

            # safety pause between 3-month windows
            time.sleep(3)

    # final save
    if all_batches:
        df_all = pd.concat(all_batches, ignore_index=True)
        if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
            old = pd.read_csv(DATA_FILE)
            df_all = pd.concat([old, df_all], ignore_index=True).drop_duplicates(subset=["id"])
        df_all.to_csv(DATA_FILE, index=False)
        print(f"\n✅ Finished: {len(df_all)} unique posts saved to {DATA_FILE}")


In [28]:
batch_collect_reddit(start_year=2015, end_year=2025)



📆 Collecting Jan 2015 – Mar 2015
🔍 Running query: (Brazil) AND ("natural disaster" OR "flood" OR "landslide" OR "brazil" OR "mudsl...
🔎 Searching r/worldnews...


KeyboardInterrupt: 