In [20]:
import couchdb
import os
from dotenv import load_dotenv
load_dotenv()
import requests
import json
import traceback
from r2client.R2Client import R2Client as r2
import requests
import json
import csv

import pandas as pd

In [5]:
# Prefix prepended to a file name to form its object key when downloading.
S3_RDV_ROOT = "rdv/"

# Cloudflare R2 account id and key pair, read from the environment (None when unset).
ACCOUNT_ID, AWS_SECRET_KEY_ID, AWS_SECRET_ACCESS_KEY = (
    os.environ.get(var_name)
    for var_name in ("ACCOUNT_ID", "AWS_SECRET_KEY_ID", "AWS_SECRET_ACCESS_KEY")
)

# CouchDB location and login, also read from the environment (None when unset).
COUCHDB_HOST, COUCHDB_PORT, COUCHDB_USERNAME, COUCHDB_PASSWORD = (
    os.environ.get(var_name)
    for var_name in ("COUCHDB_HOST", "COUCHDB_PORT", "COUCHDB_USERNAME", "COUCHDB_PASSWORD")
)
# Base URL used by the HTTP helpers; AUTH is the (username, password) pair passed as `auth`.
COUCHDB_URL = f"http://{COUCHDB_HOST}:{COUCHDB_PORT}"
AUTH = COUCHDB_USERNAME, COUCHDB_PASSWORD

# Helpers

In [6]:
def connect_to_s3():
    """
    Builds the R2 client from the module-level account id and key pair.

    Returns:
        The object returned by ``r2(...)``, or None when building it raises.
        In that case the exception and its traceback are printed rather than
        propagated.
    """
    client = None
    try:
        settings = {
            "access_key": AWS_SECRET_KEY_ID,
            "secret_key": AWS_SECRET_ACCESS_KEY,
            "endpoint": f"https://{ACCOUNT_ID}.r2.cloudflarestorage.com",
        }
        client = r2(**settings)
        print("S3 client for Cloudflare R2 successfully created")
    except Exception as err:
        print(err)
        traceback.print_exc()
        client = None
    return client
    
def _make_couchdb_request(method, url, auth=None, **kwargs):
    """
    Makes an HTTP request to CouchDB and handles common errors.

    Args:
        method (str): HTTP verb, e.g. "GET" or "POST".
        url (str): Full URL of the CouchDB endpoint.
        auth (tuple, optional): (username, password) pair passed to requests.
        **kwargs: Forwarded unchanged to ``requests.request`` (``json``,
                  ``params``, ``headers``, ...). A ``timeout`` of
                  (10s connect, 120s read) is applied unless the caller
                  supplies one.

    Returns:
        The decoded JSON body on success, or None when the request fails, the
        server answers with a 4XX/5XX status, or the body is not valid JSON.
        Failures are reported with print() instead of being raised.
    """
    # requests waits forever by default; bound the wait so a stalled server
    # cannot hang the notebook. Callers can still override via timeout=...
    kwargs.setdefault("timeout", (10, 120))

    try:
        response = requests.request(method, url, auth=auth, **kwargs)
        response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} {e.response.reason}")
        try:
            print(f"CouchDB Error: {e.response.json()}")
        except ValueError:
            # Every JSON decode error raised by requests (json, simplejson or
            # requests' own wrapper) is a ValueError subclass.
            print(f"CouchDB Response (not JSON): {e.response.text}")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: Could not connect to CouchDB at {url}. Details: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # Decode outside the request handlers so a 2xx answer with a non-JSON body
    # is reported explicitly instead of escaping as an uncaught ValueError.
    try:
        return response.json()
    except ValueError:
        print(f"CouchDB Response (not JSON): {response.text}")
        return None

def create_couchdb_index(db_name, index_fields, ddoc_name=None, index_name=None, index_type="json"):
    """
    Creates a Mango query index by POSTing to the database's _index endpoint.

    Args:
        db_name (str): The name of the database.
        index_fields (list): Field names to index, e.g. ["name", "age"].
                             Must not be empty.
        ddoc_name (str, optional): Design document name. When omitted it is
                                   "ddoc_" + the first field, lower-cased with
                                   dots replaced by underscores.
        index_name (str, optional): Index name. When omitted it is "idx_" +
                                    all fields (lower-cased, dots replaced by
                                    underscores) joined with "_".
        index_type (str, optional): The type of index, usually "json".

    Returns:
        dict: The JSON response from CouchDB if successful, None otherwise
              (including when index_fields is empty).
    """
    if not index_fields:
        print("Error: index_fields list cannot be empty.")
        return None

    def _slug(field_name):
        # Lower-case and replace dots so nested paths give a flat name.
        return field_name.lower().replace('.', '_')

    if ddoc_name is None:
        ddoc_name = "ddoc_" + _slug(index_fields[0])
    if index_name is None:
        index_name = "idx_" + '_'.join(_slug(field_name) for field_name in index_fields)

    print(f"Attempting to create index '{index_name}' in ddoc '{ddoc_name}' for fields {index_fields} in database '{db_name}'...")
    response_data = _make_couchdb_request(
        "POST",
        f"{COUCHDB_URL}/{db_name}/_index",
        auth=AUTH,
        json={
            "index": {"fields": index_fields},
            "ddoc": ddoc_name,
            "name": index_name,
            "type": index_type
        },
        headers={'Content-Type': 'application/json'},
    )

    if not response_data:
        return response_data

    outcome = response_data.get("result")
    if outcome == "created":
        print(f"Index '{index_name}' created successfully.")
    elif outcome == "exists":
        print(f"Index '{index_name}' already exists.")
    else:
        print(f"Index creation status: {response_data.get('result', 'unknown')}")
    return response_data

def query_couchdb_documents(db_name, selector, limit=100, fields=None, sort=None, use_index=None):
    """
    Runs a Mango query by POSTing to the database's _find endpoint.

    Args:
        db_name (str): The name of the database.
        selector (dict): The Mango query selector, e.g. {"name": "Alice"},
                         or {} to match every document (up to limit).
        limit (int, optional): Maximum number of documents to return.
                               Defaults to 100.
        fields (list, optional): Fields to return for each document. Omitted
                                 from the request when falsy.
        sort (list, optional): Sort criteria, e.g. [{"name": "asc"}]. Omitted
                               from the request when falsy.
        use_index (str or list, optional): Index hint, either a design doc
                                           name or [ddoc_name, index_name].
                                           Omitted from the request when falsy.

    Returns:
        list: The "docs" list of the response, or an empty list when the
              request failed or the response has no "docs" key.
    """
    request_body = {"selector": selector, "limit": limit}
    # Optional clauses are only sent when the caller supplied a truthy value.
    for clause, value in (("fields", fields), ("sort", sort), ("use_index", use_index)):
        if value:
            request_body[clause] = value

    print(f"Querying database '{db_name}' with selector: {selector}, limit: {limit}...")
    result = _make_couchdb_request(
        "POST",
        f"{COUCHDB_URL}/{db_name}/_find",
        auth=AUTH,
        json=request_body,
        headers={'Content-Type': 'application/json'},
    )

    if not result or "docs" not in result:
        print("No documents found or an error occurred during query.")
        return []

    matches = result["docs"]
    print(f"Found {len(matches)} documents.")
    return matches

def get_all_couchdb_docs_paginated(db_name, batch_size=1000):
    """
    Generator function to fetch all documents from a CouchDB database in batches.

    Pages through the database's _all_docs endpoint with include_docs=true.
    After the first page, each request starts at the id of the last row of
    the previous page and uses skip=1 so that row is not returned twice.
    Design documents (ids starting with "_design/") and rows without a
    document body are skipped.

    Args:
        db_name (str): The name of the database.
        batch_size (int): The number of rows to request per page.

    Yields:
        dict: Individual documents from the database.

    Note:
        A failed request ends the iteration after printing a message; it does
        not raise, so the consumer sees a shorter (possibly empty) stream.
    """
    print(f"Starting to fetch all documents from '{db_name}' in batches of {batch_size}...")
    url = f"{COUCHDB_URL}/{db_name}/_all_docs"
    start_key = None
    total_docs_fetched = 0

    while True:
        params = {
            "include_docs": "true",
            "limit": batch_size
        }
        if start_key is not None:
            # startkey is inclusive, so skip=1 drops the row already seen as
            # the last row of the previous page.
            params["startkey"] = json.dumps(start_key) # Needs to be JSON encoded string
            params["skip"] = 1

        response_data = _make_couchdb_request("GET", url, auth=AUTH, params=params)

        if not response_data or "rows" not in response_data:
            print("Failed to fetch documents or no rows in response.")
            break

        rows = response_data["rows"]
        if not rows:
            print("No more documents found.")
            break

        docs_in_batch = 0
        for row in rows:
            doc = row.get("doc")
            # Ignore rows without a body and design documents.
            if doc and not row["id"].startswith("_design/"):
                yield doc
                docs_in_batch += 1
                total_docs_fetched += 1

        print(f"  Fetched {docs_in_batch} documents in this batch. Total fetched: {total_docs_fetched}")

        # End-of-data must be judged on the rows CouchDB returned, not on the
        # documents yielded: a full page that contains a filtered-out design
        # document would otherwise look short and stop the export early.
        if len(rows) < batch_size:
            print(f"Fetched {len(rows)} rows which is less than batch_size {batch_size}, assuming end of data.")
            break

        # Set start_key for the next iteration to the ID of the last row in this batch
        start_key = rows[-1]["id"]

    print(f"Finished fetching. Total documents processed: {total_docs_fetched}")

def write_docs_to_csv(documents_iterable, csv_filepath, fieldnames_mapping):
    """
    Streams documents into a CSV file, one row per document.

    Args:
        documents_iterable: An iterable (e.g., a generator) yielding document
                            dictionaries. It is consumed lazily, row by row.
        csv_filepath (str): Path of the CSV file to create (overwritten if it
                            exists, written as UTF-8).
        fieldnames_mapping (dict): CSV header -> accessor. An accessor is
                                   either a callable taking the document, or
                                   a string key where dots descend into
                                   nested dictionaries.
                                   e.g., {"City": lambda doc: doc.get("location", {}).get("name")}
                                   e.g., {"Provider": "provider"}
                                   A callable that raises, a path that cannot
                                   be followed, or any other accessor type
                                   gives None for that cell.
    """
    def _resolve(document, accessor):
        # Callable accessors: any failure is swallowed and becomes None.
        if callable(accessor):
            try:
                return accessor(document)
            except Exception:
                return None
        # Anything that is neither callable nor a string has no meaning here.
        if not isinstance(accessor, str):
            return None
        # String accessors: walk the dotted path one key at a time.
        path = accessor.split('.')
        current = document
        try:
            for part in path:
                current = current[part]
        except (KeyError, TypeError):
            return None
        return current

    print(f"Writing documents to CSV: {csv_filepath}")
    rows_written = 0
    with open(csv_filepath, 'w', newline='', encoding='utf-8') as out_file:
        # The mapping's keys, in order, are the CSV header row.
        csv_writer = csv.DictWriter(out_file, fieldnames=list(fieldnames_mapping))
        csv_writer.writeheader()

        for rows_written, document in enumerate(documents_iterable, start=1):
            csv_writer.writerow({
                header: _resolve(document, accessor)
                for header, accessor in fieldnames_mapping.items()
            })
            if rows_written % 500 == 0: # Log progress
                print(f"  Written {rows_written} rows to CSV...")

    print(f"Successfully wrote {rows_written} rows to {csv_filepath}")

# Ingestion

In [7]:
# Connect with the plain base URL and attach the login separately. Interpolating
# the username/password into the URL breaks as soon as either contains a
# character such as '@', ':' or '/', because the URL is parsed to extract them.
server = couchdb.Server(COUCHDB_URL)
server.resource.credentials = AUTH
server

<Server 'http://10.34.100.114:5984'>

In [8]:
# Handle to the 'free_weather' database; its name is later passed to the
# paginated export. The bare name displays the handle's repr.
db = server['free_weather']
db

<Database 'free_weather'>

In [9]:
# Handle to the 's3_metadata' database, whose documents carry the
# data_file_name / descriptor_file_name of each uploaded file.
db_md = server['s3_metadata']
db_md

<Database 's3_metadata'>

In [10]:
# connect_to_s3() returns None when building the client fails, so despite the
# annotation `s3` may be None; displaying it makes that visible before use.
s3: r2 = connect_to_s3()
s3

S3 client for Cloudflare R2 successfully created


<r2client.R2Client.R2Client at 0x1a1abb51d90>

In [11]:
# Make sure a Mango index on data_file_name exists in the metadata database;
# the query in the next cell sorts on that field.
create_couchdb_index(db_md.name, index_fields=["data_file_name"], index_name="data-file-name")

Attempting to create index 'data-file-name' in ddoc 'ddoc_data_file_name' for fields ['data_file_name'] in database 's3_metadata'...
Index 'data-file-name' already exists.


{'result': 'exists',
 'id': '_design/ddoc_data_file_name',
 'name': 'data-file-name'}

In [12]:
# Match every metadata document and sort by data_file_name in descending order.
mango = {
    "selector": {},
    "sort": [{"data_file_name": "desc"}]
}

# Print each returned document so the file names can be inspected.
for row in db_md.find(mango):
    print(row)

<Document '393cd91c7b1e4c78a97314bef8595db0'@'1-b90960d1e7d7a7d60fbd172f3c6ed457' {'descriptor_file_name': '-7_9789104_112_59608_descriptor_2025-05-19_18_13_02_157939.csv', 'data_file_name': '-7_9789104_112_59608_data_2025-05-19_18_13_02_157939.csv'}>
<Document 'f59b3c22c9524990b430cfc062a6cc6f'@'1-aef90468463e3aaeb7c7c15648d2e414' {'descriptor_file_name': '-7_9789104_112_59608_descriptor_2025-05-19_18_07_52_065567.csv', 'data_file_name': '-7_9789104_112_59608_data_2025-05-19_18_07_52_065567.csv'}>
<Document '21759c594ce749dfb5278b3457983007'@'1-b0dfb873201dea84825d697ce8c39d56' {'descriptor_file_name': '-7_9789104_112_59608_descriptor_2025-05-19_17_56_31_459242.csv', 'data_file_name': '-7_9789104_112_59608_data_2025-05-19_17_56_31_459242.csv'}>


In [14]:
# Hard-coded copy of the first data_file_name printed by the descending query above.
# NOTE(review): has to be updated by hand whenever a newer file is uploaded.
latest_file_name = '-7_9789104_112_59608_data_2025-05-19_18_13_02_157939.csv' 

In [57]:
# Download rdv/<latest_file_name> to ../data/localcopy_<latest_file_name>.
# NOTE(review): 'work' is presumably the bucket name and the argument order
# (bucket, object key, local path) is inferred from this call — confirm against r2client.
s3.download_file('work', S3_RDV_ROOT + latest_file_name, '../data/localcopy_' + latest_file_name)

File rdv/-7_9789104_112_59608_data_2025-05-19_18_13_02_157939.csv downloaded successfully.


In [15]:
# Load the local copy written by the download cell and summarise its columns,
# dtypes and non-null counts.
df = pd.read_csv('../data/localcopy_' + latest_file_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175344 entries, 0 to 175343
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   datetime                    175344 non-null  object 
 1   temperature_2m              175328 non-null  float64
 2   is_day                      175344 non-null  int64  
 3   relative_humidity_2m        175328 non-null  float64
 4   dew_point_2m                175328 non-null  float64
 5   apparent_temperature        175328 non-null  float64
 6   precipitation               175328 non-null  float64
 7   weather_code                175328 non-null  float64
 8   pressure_msl                175328 non-null  float64
 9   surface_pressure            175328 non-null  float64
 10  cloud_cover                 175328 non-null  float64
 11  cloud_cover_low             175328 non-null  float64
 12  cloud_cover_mid             175328 non-null  float64
 13  cloud_cover_hi

In [16]:
# CSV header -> dotted key path into each CouchDB document, consumed by
# write_docs_to_csv. A path that cannot be followed in a given document is
# recorded as None for that cell.
FIELD_MAPPING = {
    "id": "_id",
    "provider": "provider",
    "location_name": "location.name",
    "region": "location.region",
    "country": "location.country",
    "lat": "location.lat",
    "lon": "location.lon",
    "localtime": "location.localtime",
    "provider_last_updated": "current_weather.last_updated",
    "temp_c": "current_weather.temp_c",
    "temp_f": "current_weather.temp_f",
    "is_day": "current_weather.is_day",
    "weather_desc": "current_weather.condition.text",
    "weather_code": "current_weather.condition.code",
    "wind_mph": "current_weather.wind_mph",
    "wind_kph": "current_weather.wind_kph",
    "pressure_mb": "current_weather.pressure_mb",
    "precip_mm": "current_weather.precip_mm",
    "humidity": "current_weather.humidity",
    "cloud_cover": "current_weather.cloud",
    "feelslike_c": "current_weather.feelslike_c",
    "uv": "current_weather.uv",
    "doc_created_at": "created_at",
    "doc_last_updated": "last_updated"
}

In [17]:
# Destination of the CSV export.
# NOTE(review): the date in the file name is hard-coded, so a re-run on another
# day overwrites this file under the old date.
csv_output_file = '../data/localcopy_free_weather_2025-06-03.csv'

In [21]:
# The generator is lazy: no request is made until write_docs_to_csv starts
# iterating, so fetching and writing interleave batch by batch.
all_documents_iterator = get_all_couchdb_docs_paginated(db.name, batch_size=500)

# Stream every document through FIELD_MAPPING into the CSV file.
write_docs_to_csv(all_documents_iterator, csv_output_file, FIELD_MAPPING)

print(f"\nCSV generation process complete. Check '{csv_output_file}'.")

Writing documents to CSV: ../data/localcopy_free_weather_2025-06-03.csv
Starting to fetch all documents from 'free_weather' in batches of 500...
  Written 500 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 500
  Written 1000 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 1000
  Written 1500 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 1500
  Written 2000 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 2000
  Written 2500 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 2500
  Written 3000 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 3000
  Written 3500 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 3500
  Written 4000 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 4000
  Written 4500 rows to CSV...
  Fetched 500 documents in this batch. Total fetched: 4500
  Written 5000 rows to CSV...
  Fetched 500 documents in