## Data Collection of Brazilian Natural Disaster Reddit Posts 

Across 10 years, we will collect posts in all languages. There are some limitations on speed, since the API doesn't let you do too many requests in a quick time frame, since we're looking over many queries this makes it harder. 

In [1]:
import praw
import pandas as pd
from datetime import datetime
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from secretcodes import reddit_secret, reddit_client

# Configure Reddit API
reddit = praw.Reddit(
    client_id=reddit_client,
    client_secret=reddit_secret,
    user_agent="brazil_disasters_scraper by u/haleyhernan"
)


Our terms from the database on the Google Sheets. 

In [2]:
english_terms = [
    "natural disaster", "flood", "landslide", "brazil", "mudslide", "rockslide",
    "tragedy", "emergency", "heavy rain", "storm", "road blocked", "evacuate",
    "catastrophe", "hazard", "crisis", "search and rescue", "disaster zone",
    "road closure", "highway closed", "emergency response", "disaster relief",
    "intense", "torrential", "severe", "extreme", "floodwaters", "flash flood",
    "collapse", "destruction", "damage", "injured", "dead", "casualties",
    "missing", "fatalities", "mountain", "hillside", "slum", "earth movement",
    "soil erosion", "slope"
]

portuguese_terms = [
    "desastre natural", "enchente", "deslizamento de terra", "brasil", "deslizamento de lama",
    "deslizamento de rochas", "tragédia", "emergência", "chuva forte", "tempestade",
    "estrada bloqueada", "evacuar", "catástrofe", "perigo", "crise", "busca e resgate",
    "zona de desastre", "fechamento de estrada", "rodovia fechada", "resposta de emergência",
    "ajuda humanitária em casos de desastres", "intenso", "torrencial", "forte", "extremo",
    "águas da enchente", "inundação repentina", "colapso", "destruição", "dano", "ferido",
    "morto", "baixas", "ausente", "fatalidades", "montanha", "encosta", "favela",
    "movimento da terra", "erosão do solo", "declive"
]

spanish_terms = [
    "desastre natural", "inundación", "corrimiento de tierras", "brasil", "avalancha de lodo",
    "deslizamiento de rocas", "tragedia", "emergencia", "lluvia pesada", "tormenta",
    "carretera bloqueada", "evacuar", "catástrofe", "peligro", "crisis", "búsqueda y rescate",
    "zona de desastre", "cierre de carretera", "autopista cerrada", "respuesta de emergencia",
    "socorro en casos de desastre", "intenso", "torrencial", "severo", "extremo",
    "aguas de la inundación", "inundación repentina", "colapso", "destrucción", "daño",
    "herido", "muerto", "bajas", "desaparecido", "muertes", "montaña", "ladera", "barrio bajo",
    "movimiento de la tierra", "erosión del suelo", "pendiente"
]

all_terms = english_terms + portuguese_terms + spanish_terms


In [3]:
def chunk_query_terms(terms, chunk_size=10):
    for i in range(0, len(terms), chunk_size):
        yield terms[i:i + chunk_size]

def build_query(base, terms):
    combined = " OR ".join(f'"{t}"' for t in terms)
    return f"({base}) AND ({combined})"

# Prepare query chunks
queries = [build_query("Brazil", chunk) for chunk in chunk_query_terms(all_terms, chunk_size=10)]


In [4]:
DATA_FILE = "new_brazil_landslides_reddit.csv"
SAVE_EVERY = 5
BATCH_SIZE = 100
SUBREDDITS = ["worldnews", "news", "brazil", "environment", "earthscience"]

seen = set()
if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
    try:
        existing = pd.read_csv(DATA_FILE)
        seen.update(existing["id"].astype(str))
        print(f"🔁 Loaded {len(seen)} existing posts from {DATA_FILE}")
    except pd.errors.EmptyDataError:
        print("⚠️ Empty file found, starting fresh.")
else:
    print("📁 No existing data found — starting fresh.")


📁 No existing data found — starting fresh.


We will be doing parallel batching for data collection to help with speed. 

In [5]:
def collect_reddit_parallel(start_date, end_date, query, subreddits, max_workers=5):
    posts = []

    def fetch_sub(sub):
        sub_posts = []
        try:
            for post in reddit.subreddit(sub).search(query, sort="new", limit=BATCH_SIZE):
                post_date = datetime.utcfromtimestamp(post.created_utc)
                if start_date <= post_date <= end_date and post.id not in seen:
                    seen.add(post.id)
                    sub_posts.append({
                        "id": post.id,
                        "title": post.title,
                        "subreddit": sub,
                        "url": post.url,
                        "score": post.score,
                        "num_comments": post.num_comments,
                        "created_utc": post.created_utc,
                        "created_date": post_date.strftime("%Y-%m-%d")
                    })
        except Exception as e:
            print(f"⚠️ Error fetching {sub}: {e}")
        return sub_posts

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_sub, sub): sub for sub in subreddits}
        for f in as_completed(futures):
            posts.extend(f.result())

    print(f"✅ Collected {len(posts)} posts in this batch")
    return pd.DataFrame(posts)


In [7]:
def batch_collect_reddit_parallel(start_year=2015, end_year=2025, max_workers=3):
    all_batches = []
    batch_count = 0

    for year in range(start_year, end_year + 1):
        for month in range(1, 13, 3):  # 3-month windows
            start_date = datetime(year, month, 1)
            end_month = min(month + 2, 12)
            last_day = 30 if end_month != 2 else 28
            end_date = datetime(year, end_month, last_day, 23, 59, 59)
            print(f"\n📆 Collecting {start_date.strftime('%b %Y')} – {end_date.strftime('%b %Y')}")

            # Parallelize over query chunks
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = {executor.submit(collect_reddit_parallel, start_date, end_date, q, SUBREDDITS, max_workers): q for q in queries}
                for f in as_completed(futures):
                    df_batch = f.result()
                    if not df_batch.empty:
                        all_batches.append(df_batch)
                        batch_count += 1

                    # periodic save
                    if batch_count % SAVE_EVERY == 0 and all_batches:
                        df_all = pd.concat(all_batches, ignore_index=True)
                        if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
                            old = pd.read_csv(DATA_FILE)
                            df_all = pd.concat([old, df_all], ignore_index=True).drop_duplicates(subset=["id"])
                        df_all.to_csv(DATA_FILE, index=False)
                        print(f"💾 Saved {len(df_all)} total posts so far")
                        all_batches = []

    # final save
    if all_batches:
        df_all = pd.concat(all_batches, ignore_index=True)
        if os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0:
            old = pd.read_csv(DATA_FILE)
            df_all = pd.concat([old, df_all], ignore_index=True).drop_duplicates(subset=["id"])
        df_all.to_csv(DATA_FILE, index=False)
        print(f"\n✅ Finished: {len(df_all)} unique posts saved to {DATA_FILE}")


In [10]:
batch_collect_reddit_parallel(start_year=2015, end_year=2025, max_workers=4)



📆 Collecting Jan 2015 – Mar 2015
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Apr 2015 – Jun 2015
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Jul 2015 – Sep 2015
✅ Collected 0 

We are retrieving data missed from the timeouts.

In [11]:
batch_collect_reddit_parallel(start_year=2018, end_year=2019, max_workers=4)


📆 Collecting Jan 2018 – Mar 2018
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Apr 2018 – Jun 2018
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Jul 2018 – Sep 2018
✅ Collected 0 

In [12]:
batch_collect_reddit_parallel(start_year=2022, end_year=2023, max_workers=4)


📆 Collecting Jan 2022 – Mar 2022
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Apr 2022 – Jun 2022
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch
✅ Collected 0 posts in this batch

📆 Collecting Jul 2022 – Sep 2022
✅ Collected 0 