# Rainfall Data Preparation: Cleaning and Geocoding

This notebook processes the raw rainfall data to create a clean, tidy, and geo-located dataset. The result will later be merged with the cleaned food price data to form the final panel dataset for analysis.

In [11]:
import pandas as pd
import numpy as np
import os
import time
from geopy.geocoders import Nominatim
from datetime import datetime

# --- Input ---------------------------------------------------------------
# Source workbook; each yearly worksheet is read by name.
RAW_EXCEL_PATH = "raw_dataset/Table 1.2 Amount of Rainfall by Monitoring Station, 2015-2024.xlsx"

# Worksheet names to process (kept as strings because they are used as sheet names).
TARGET_YEARS = [str(sheet_year) for sheet_year in range(2021, 2025)]

# --- Output --------------------------------------------------------------
CLEANED_DATA_DIR = "data_cleaned"

# Create the output folder up front so later writes cannot fail on a missing directory.
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

OUTPUT_FILE_TIDY = os.path.join(CLEANED_DATA_DIR, "rainfall_2021_2024_tidy.csv")
OUTPUT_FILE_MAPREADY = os.path.join(CLEANED_DATA_DIR, "rainfall_2021_2024_mapready.csv")

In [12]:
def clean_sheet(sheet_name, file_path):
    """Load one worksheet of the rainfall workbook and return it in long (tidy) form.

    Processing steps:
      1. Read the sheet, skipping the first 5 rows.
      2. Strip surrounding whitespace from every non-null cell (cells become strings).
      3. Cut the table at the first row in which any cell contains "notes"
         (case-insensitive); that row and everything below it is discarded.
      4. Drop fully empty rows/columns and the 'Annual' column, if present.
      5. Melt the remaining columns into one row per (station, column), with the
         original column name stored under 'Month'.
      6. Convert 'Rainfall' to numeric and drop rows without a station name or
         without a numeric rainfall value.

    Parameters
    ----------
    sheet_name : str
        Worksheet to read. The value is also stored verbatim in the 'Year' column.
    file_path : str
        Path to the Excel workbook.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'Monitoring Station', 'Month', 'Rainfall', 'Year'.
        None when the sheet cannot be read (the error is printed).

    Raises
    ------
    KeyError
        If the sheet has no 'Unnamed: 0' / 'Monitoring Station' column to melt on.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=5)
    except Exception as e:
        print(f"Error loading sheet '{sheet_name}': {e}")
        return None

    # DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map;
    # use the new name when it exists and fall back on older versions.
    cellwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = cellwise(lambda x: str(x).strip() if pd.notnull(x) else x)

    # Remove the "Notes:" footer. The cut is positional so it does not depend on
    # the frame carrying a default 0..n-1 index.
    is_note_row = df.apply(
        lambda col: col.astype(str).str.contains(r'(?i)notes', na=False)
    ).any(axis=1)
    if is_note_row.any():
        first_note_pos = int(is_note_row.to_numpy().argmax())
        df = df.iloc[:first_note_pos]

    # Drop fully empty rows and columns
    df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

    # Rename and drop non-data columns
    if 'Unnamed: 0' in df.columns:
        df = df.rename(columns={'Unnamed: 0': 'Monitoring Station'})
    if 'Annual' in df.columns:
        df = df.drop(columns=['Annual'])

    df_long = df.melt(
        id_vars=['Monitoring Station'],
        var_name='Month',
        value_name='Rainfall'
    )

    # Add year column
    df_long['Year'] = sheet_name

    # Drop rows without a usable station name. The explicit copy keeps the
    # assignment below from writing into a view of the melted frame.
    df_long = df_long.dropna(subset=['Monitoring Station'])
    df_long = df_long[df_long['Monitoring Station'] != ''].copy()

    # Coerce FIRST, then drop: non-numeric placeholders become NaN here, and
    # dropping afterwards removes both originally-missing and junk readings.
    df_long['Rainfall'] = pd.to_numeric(df_long['Rainfall'], errors='coerce')
    df_long = df_long.dropna(subset=['Rainfall'])

    return df_long

# Clean every target-year sheet. Sheets that fail to load return None from
# clean_sheet (which prints the error) and are skipped here.
dfs = []
for year in TARGET_YEARS:
    df_result = clean_sheet(year, RAW_EXCEL_PATH)
    if df_result is not None:
        dfs.append(df_result)
        print(f"  ✅ Processed sheet: {year}. Rows: {len(df_result)}")

if not dfs:
    # Fail fast: without this, the cell would finish silently and later cells
    # would either hit a NameError or reuse a stale df_all_tidy left in the kernel.
    raise RuntimeError(
        f"No rainfall sheets could be loaded from {RAW_EXCEL_PATH!r} "
        f"(tried sheets: {TARGET_YEARS})."
    )

# Stack the yearly frames into one tidy table.
df_all_tidy = pd.concat(dfs, ignore_index=True)

# 'Year' holds the sheet name as a string; store it as an integer.
df_all_tidy['Year'] = df_all_tidy['Year'].astype(int)

df_all_tidy.to_csv(OUTPUT_FILE_TIDY, index=False)
print(f"All rainfall data combined and saved to: {OUTPUT_FILE_TIDY}")
print(f"Total Tidy Rainfall Rows: {len(df_all_tidy)}")

  df = df.applymap(lambda x: str(x).strip() if pd.notnull(x) else x)
  df = df.applymap(lambda x: str(x).strip() if pd.notnull(x) else x)
  df = df.applymap(lambda x: str(x).strip() if pd.notnull(x) else x)
  df = df.applymap(lambda x: str(x).strip() if pd.notnull(x) else x)


  ✅ Processed sheet: 2021. Rows: 696
  ✅ Processed sheet: 2022. Rows: 695
  ✅ Processed sheet: 2023. Rows: 696
  ✅ Processed sheet: 2024. Rows: 654
All rainfall data combined and saved to: data_cleaned/rainfall_2021_2024_tidy.csv
Total Tidy Rainfall Rows: 2741


In [13]:
# Initialize the Nominatim geocoder client used for every automatic lookup below.
# A custom user_agent identifies this notebook to the service; timeout=10 is the
# per-request timeout passed to geopy.
geolocator = Nominatim(user_agent="agriprice_rainfall_geocoder", timeout=10)

# --- Manual Coordinate Fixes ---
# Hard-coded (latitude, longitude) pairs that take precedence over geocoding.
# Keys must match the 'Monitoring Station' strings exactly, because the lookup
# is a plain dictionary membership test on the raw station name.
# Note: the "Cabanatuan ... (Station was transferred to CLSU ...)" entry
# deliberately carries the same coordinates as "CLSU, Nueva Ecija".
MANUAL_COORDS = {
    "CLSU, Nueva Ecija": (15.7328, 120.9310),
    "Molugan-El Salvador, Misamis Oriental (former Lumbia Station)": (8.5745, 124.5382),
    "Port Area (MCO), Manila": (14.5833, 120.9667),
    "Science Garden, Quezon City": (14.6488, 121.0330),
    "NAIA (MIA), Pasay City": (14.5086, 121.0198),
    "Tagbilaran City, Bohol (Station was transferred to Dauis, Bohol)": (9.6174, 123.8611),
    "Cabanatuan, Nueva Ecija (Station was transferred to CLSU, Nueva Ecija)": (15.7328, 120.9310),
    "Sinait, Ilocos Sur (former Vigan Station)": (17.8674, 120.4570),
    "Cubi Point, Subic Bay Olongapo City, Zambales": (14.7708, 120.2608),
    "Mactan International Airport, Cebu": (10.3073, 123.9744),
    "Virac Synop, Catanduanes": (13.5858, 124.2378),
}

def clean_station_name(name: str) -> str:
    """Return a simplified station name for use in a geocoding query.

    Removes every " Synop" occurrence, discards everything from the first
    "(" onward, and trims surrounding whitespace.
    """
    without_synop = name.replace(" Synop", "")
    # partition() yields the whole string as the head when no "(" is present,
    # so a single expression covers both the annotated and the plain case.
    head, _, _ = without_synop.partition("(")
    return head.strip()


# Geocode each distinct station once, then attach the coordinates to every
# rainfall row via a left merge on the station name.
stations = df_all_tidy['Monitoring Station'].unique()
coords = []

for i, station in enumerate(stations):
    if station in MANUAL_COORDS:
        # Hand-curated coordinates win over the geocoder.
        lat, lon = MANUAL_COORDS[station]
        print(f"[{i+1}/{len(stations)}] Manual hit: {station}")
    else:
        clean_name = clean_station_name(station)
        location = None

        # Try the most specific query first, then a looser fallback.
        queries = [
            f"{clean_name} PAGASA Weather Station, Philippines",
            f"{clean_name}, Philippines",
        ]

        for q in queries:
            try:
                location = geolocator.geocode(q)
            except Exception as e:
                print(f"⚠️ Geocoding error for {station}: {e}")
            finally:
                # Pause after EVERY request, including successful ones. Sleeping
                # only after failures lets consecutive hits reach the service
                # back-to-back and risks being rate limited.
                time.sleep(1)
            if location:
                break

        if location:
            lat, lon = location.latitude, location.longitude
            print(f"[{i+1}/{len(stations)}] Found: {station}")
        else:
            lat, lon = None, None
            print(f"[{i+1}/{len(stations)}] Could not locate: {station}")

    coords.append({
        'Monitoring Station': station,
        'Latitude': lat,
        'Longitude': lon
    })

# One row per station; the left merge keeps every rainfall row even when a
# station could not be located (its coordinates are then missing).
coords_df = pd.DataFrame(coords)
df_mapready = df_all_tidy.merge(coords_df, on='Monitoring Station', how='left')

df_mapready.to_csv(OUTPUT_FILE_MAPREADY, index=False)
print(f"\n FINAL Map-ready dataset saved to: {OUTPUT_FILE_MAPREADY}")

# Report stations that still lack coordinates so they can be added to MANUAL_COORDS.
missing_stations = coords_df[coords_df['Latitude'].isnull()]['Monitoring Station'].tolist()
if missing_stations:
    print(f"\n Still missing coordinates for {len(missing_stations)} station(s):")
    for m in missing_stations:
        print(f"- {m}")
else:
    print("\n All stations successfully geocoded!")

[1/61] Found: Alabat, Quezon
[2/61] Found: Ambulong, Batangas
[3/61] Found: Aparri, Cagayan
[4/61] Found: Baguio City, Benguet
[5/61] Found: Baler, Aurora (Radar)
[6/61] Found: Basco, Batanes (Radar)
[7/61] Found: Borongan, Eastern Samar
[8/61] Found: Butuan City, Agusan Del Norte
[9/61] Manual hit: Cabanatuan, Nueva Ecija (Station was transferred to CLSU, Nueva Ecija)
[10/61] Found: Calapan, Oriental Mindoro
[11/61] Found: Calayan, Cagayan
[12/61] Found: Casiguran, Aurora
[13/61] Found: Catarman, Northern Samar
[14/61] Found: Catbalogan, Western Samar
[15/61] Found: Clark International Airport, Pampanga
[16/61] Manual hit: CLSU, Nueva Ecija
[17/61] Found: Coron, Palawan
[18/61] Found: Cotabato City, Maguindanao
[19/61] Manual hit: Cubi Point, Subic Bay Olongapo City, Zambales
[20/61] Found: Cuyo, Palawan
[21/61] Found: Daet, Camarines Norte
[22/61] Found: Dagupan City, Pangasinan
[23/61] Found: Dauis, Bohol
[24/61] Found: Davao City, Davao Del Sur
[25/61] Found: Dipolog, Zamboanga Del