# American Samoa Rainfall Prediction Testing Notebook

This notebook allows you to test and run the entire rainfall prediction pipeline from data processing to ensemble model training. It provides an interactive way to adjust parameters and run each step of the pipeline.

## Setup

First, let's set up the environment and import necessary libraries.

In [10]:
import os
import shlex
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import yaml
from IPython.display import display, HTML

# Define project root directory.
# The notebook is expected to be launched from the repository root, so the
# current working directory is used as the project root.
PROJECT_ROOT = os.path.abspath(os.getcwd())
print(f"Project root: {PROJECT_ROOT}")

# Add the `scripts` directory of every pipeline stage to the import path.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
PIPELINE_STAGE_DIRS = (
    '1_Process_Rainfall_Data',
    '2_Create_ML_Data',
    '3_Hyperparameter_Tuning',
    '4_Train_Best_Model',
    '5_Train_Ensemble',
)
for stage_dir in PIPELINE_STAGE_DIRS:
    scripts_dir = os.path.join(PROJECT_ROOT, stage_dir, 'scripts')
    if scripts_dir not in sys.path:
        sys.path.append(scripts_dir)

# Set up TensorFlow: report the version and any visible GPUs
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

Project root: /Users/jlee/Desktop/github/AS_rainfall
TensorFlow version: 2.18.0
GPU available: []


## Helper Functions

These functions will help us run terminal commands and import modules from the pipeline.

In [2]:
# Function to run terminal commands
def run_command(command):
    """Run a terminal command and return its captured standard output.

    Args:
        command: Either a single string, which is executed through the shell,
            or a sequence of arguments (e.g. a list), which is executed
            directly without a shell so that arguments containing spaces or
            shell metacharacters need no quoting.

    Returns:
        The text the command wrote to standard output. This is returned even
        when the command exits with a non-zero status; in that case the
        command and its standard error are printed first. No exception is
        raised for a failing exit status.
    """
    import subprocess

    # Strings keep the original shell behaviour; sequences bypass the shell.
    use_shell = isinstance(command, str)
    if not use_shell:
        # Allow numbers / path objects in the argument list.
        command = [str(part) for part in command]

    result = subprocess.run(command, shell=use_shell, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error executing command: {command}")
        print(f"Error message: {result.stderr}")
    return result.stdout

# Function to import a module dynamically
def import_module_from_file(module_path):
    """Import a module from a file path and return the module object.

    The module name is the file's base name without its extension. The file
    is executed on every call and the resulting module is not registered in
    ``sys.modules``.

    Args:
        module_path: Path to a Python source file.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec/loader can be created for the path
            (for example, the file does not have a Python source extension).
    """
    import importlib.util

    # splitext removes only the trailing extension, unlike str.replace,
    # which would strip every '.py' occurrence inside the file name.
    module_name = os.path.splitext(os.path.basename(module_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path!r}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

## 1. Process Rainfall Data

This section processes raw rainfall data into monthly aggregates.

In [8]:
# Import the rainfall processing module
rainfall_daily_to_monthly_path = os.path.join(PROJECT_ROOT, '1_Process_Rainfall_Data', 'scripts', 'rainfall_daily_to_monthly.py')
rainfall_module = import_module_from_file(rainfall_daily_to_monthly_path)

# Define parameters
input_dir = os.path.join(PROJECT_ROOT, 'raw_data', 'rainfall')
output_dir = os.path.join(PROJECT_ROOT, '1_Process_Rainfall_Data', 'output', 'monthly_rainfall')

# Create a class to mimic argparse
class Args:
    """Minimal stand-in for an argparse namespace with `input`/`output` attributes."""

    def __init__(self, input_dir, output_dir):
        self.input = input_dir
        self.output = output_dir

args = Args(input_dir, output_dir)

# Run the rainfall processing as a subprocess.
# Two spellings of the command-line flags are tried in order. The exit status
# decides whether the fallback is needed (a failing command does not raise, so
# a try/except around it would never trigger the second attempt).
# The argument list is passed without a shell, so paths containing spaces work.
print(f"Processing rainfall data from {input_dir} to {output_dir}")
for input_flag, output_flag in (('--input_dir', '--output_dir'), ('--input', '--output')):
    cmd_parts = ['python3', rainfall_daily_to_monthly_path, input_flag, input_dir, output_flag, output_dir]
    cmd = shlex.join(cmd_parts)  # printable form of the command
    result = subprocess.run(cmd_parts, capture_output=True, text=True)
    if result.returncode == 0:
        output = result.stdout
        print(output)
        break
    print(f"Error executing command: {cmd}")
    print(f"Error message: {result.stderr}")
else:
    print("Error processing rainfall data: the script failed with every known flag spelling")

Processing rainfall data from /Users/jlee/Desktop/github/AS_rainfall/raw_data/rainfall to /Users/jlee/Desktop/github/AS_rainfall/1_Process_Rainfall_Data/output/monthly_rainfall
Processing rainfall data from /Users/jlee/Desktop/github/AS_rainfall/raw_data/rainfall
Output will be saved to /Users/jlee/Desktop/github/AS_rainfall/1_Process_Rainfall_Data/output/monthly_rainfall
Processing rainfall data from /Users/jlee/Desktop/github/AS_rainfall/raw_data/rainfall to /Users/jlee/Desktop/github/AS_rainfall/1_Process_Rainfall_Data/output/monthly_rainfall
Processed: satala.csv -> satala_monthly.csv [1/21]
Processed: airport5101.csv -> airport5101_monthly.csv [2/21]
Processed: pioa_afono.csv -> pioa_afono_monthly.csv [3/21]
Processed: aua.csv -> aua_monthly.csv [4/21]
Processed: siufaga_WRCC.csv -> siufaga_WRCC_monthly.csv [5/21]
Processed: airport80.csv -> airport80_monthly.csv [6/21]
Processed: vaipito_res.csv -> vaipito_res_monthly.csv [7/21]
Processed: aasufou80.csv -> aasufou80_monthly.csv [

## 2. Create ML Data

This section creates machine learning datasets by combining rainfall data with climate variables and DEM data.

In [13]:
from utils.config_utils import load_config

In [14]:
# Location of the YAML file that will hold the custom configuration
custom_config_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'config', 'custom_config.yaml')

# Custom configuration: data locations plus the model grid/patch settings
custom_config = dict(
    paths=dict(
        dem="raw_data/DEM/DEM_Tut1.tif",
        climate_data="2_Create_ML_Data/output/processed_climate_data.nc",
        raw_climate="raw_data/climate_variables",
        rainfall="1_Process_Rainfall_Data/output/monthly_rainfall",
        stations="raw_data/AS_raingages/as_raingage_list2.csv",
        output="2_Create_ML_Data/output/custom_run",
    ),
    model=dict(
        grid_size=7,  # Changed from 5 to 7
        patch_sizes=dict(
            local=5,  # Changed from 3 to 5
            regional=3,
        ),
        km_per_cell=dict(
            local=1.5,  # Changed from 2 to 1.5
            regional=8,
        ),
    ),
)

# Write the configuration to disk, creating the config directory if needed
config_dir = os.path.dirname(custom_config_path)
os.makedirs(config_dir, exist_ok=True)
with open(custom_config_path, 'w') as config_file:
    yaml.dump(custom_config, config_file, default_flow_style=False)

# Read it back through the pipeline's own loader
custom_config_loaded = load_config(custom_config_path)

# Show the loaded settings: flat entries first, then the two nested ones
print("Custom Configuration:")
nested_keys = ('patch_sizes', 'km_per_cell')
for setting_name, setting_value in custom_config_loaded.items():
    if setting_name in nested_keys:
        continue
    print(f"  {setting_name}: {setting_value}")
for nested_key in nested_keys:
    nested_values = custom_config_loaded[nested_key]
    print(f"  {nested_key}: local={nested_values['local']}, regional={nested_values['regional']}")

Custom Configuration:
  dem_path: /Users/jlee/Desktop/github/AS_rainfall/raw_data/DEM/DEM_Tut1.tif
  climate_data_path: /Users/jlee/Desktop/github/AS_rainfall/2_Create_ML_Data/output/processed_climate_data.nc
  raw_climate_dir: /Users/jlee/Desktop/github/AS_rainfall/raw_data/climate_variables
  rainfall_dir: /Users/jlee/Desktop/github/AS_rainfall/1_Process_Rainfall_Data/output/monthly_rainfall
  station_locations_path: /Users/jlee/Desktop/github/AS_rainfall/raw_data/AS_raingages/as_raingage_list2.csv
  output_dir: /Users/jlee/Desktop/github/AS_rainfall/2_Create_ML_Data/output/custom_run
  grid_size: 7
  patch_sizes: local=5, regional=3
  km_per_cell: local=1.5, regional=8


In [6]:
# Locate the ML data creation script
pipeline_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'scripts', 'rainfall_prediction_pipeline.py')

# Run the pipeline as a terminal command.
# shlex.join quotes the script path so the command still works when the
# project lives in a directory whose name contains spaces.
# NOTE(review): no arguments are passed, so custom_config.yaml written in the
# previous cell is only used if the script picks it up by itself -- verify.
print("Creating ML datasets...")
cmd = shlex.join(["python3", pipeline_path])
output = run_command(cmd)
print(output)

Creating ML datasets...
Found 11 raw climate data files in /Users/jlee/Desktop/github/AS_rainfall/raw_data/climate_variables
Found existing processed climate data at: /Users/jlee/Desktop/github/AS_rainfall/2_Create_ML_Data/output/processed_climate_data.nc



## 3. Hyperparameter Tuning

This section performs hyperparameter tuning for the LAND-inspired model.

In [None]:
# Import the hyperparameter tuning module
tuning_path = os.path.join(PROJECT_ROOT, '3_Hyperparameter_Tuning', 'scripts', 'extended_hyperparameter_tuning.py')
tuning_module = import_module_from_file(tuning_path)

# Define parameters
features_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'output', 'csv_data', 'features.csv')
targets_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'output', 'csv_data', 'targets.csv')
test_indices_path = os.path.join(PROJECT_ROOT, '3_Hyperparameter_Tuning', 'output', 'test_indices.pkl')
output_dir = os.path.join(PROJECT_ROOT, '3_Hyperparameter_Tuning', 'output', 'land_model_extended_tuner')
max_trials = 10  # Reduced for testing
epochs = 20      # Reduced for testing
resume = True    # Resume from previous tuning if available

# Create a namespace to mimic argparse
class TuningArgs:
    """Attribute container standing in for the tuning script's parsed arguments.

    The first seven parameters are required. The remaining ones were
    previously fixed inside ``__init__``; they are now optional keyword
    parameters whose defaults are the former fixed values, so existing calls
    behave exactly as before.
    """

    def __init__(self, features_path, targets_path, test_indices_path, output_dir, max_trials, epochs, resume,
                 executions_per_trial=1, batch_size=314, n_folds=5, cv_seed=42):
        self.features_path = features_path
        self.targets_path = targets_path
        self.test_indices_path = test_indices_path
        self.output_dir = output_dir
        self.max_trials = max_trials
        self.executions_per_trial = executions_per_trial
        self.epochs = epochs
        self.batch_size = batch_size
        self.n_folds = n_folds
        self.cv_seed = cv_seed
        self.resume = resume

tuning_args = TuningArgs(
    features_path=features_path,
    targets_path=targets_path,
    test_indices_path=test_indices_path,
    output_dir=output_dir,
    max_trials=max_trials,
    epochs=epochs,
    resume=resume
)

# Run hyperparameter tuning
print(f"Running hyperparameter tuning with {max_trials} trials and {epochs} epochs per trial")
print(f"Output directory: {output_dir}")

# Option 1: Run the main function directly with our args
# Uncomment this to run directly (may take a long time)
# tuning_module.main(tuning_args)

In [None]:
# Option 2: Run as a terminal command with specific parameters
# This is more flexible and can be interrupted
# The command is assembled from an argument list and quoted with shlex.join,
# so paths containing spaces or shell metacharacters are passed intact.
# NOTE: `output_dir` and `epochs` are reassigned by later sections of this
# notebook; run this cell right after the tuning parameter cell.
tuning_cmd_parts = [
    "python3", tuning_path,
    "--features_path", features_path,
    "--targets_path", targets_path,
    "--test_indices_path", test_indices_path,
    "--output_dir", output_dir,
    "--max_trials", str(max_trials),
    "--epochs", str(epochs),
]
if resume:
    tuning_cmd_parts.append("--resume")
cmd = shlex.join(tuning_cmd_parts)

print(f"Command: {cmd}")
# Uncomment to run (may take a long time)
# output = run_command(cmd)
# print(output)

## 4. Train Best Model

This section trains the best model using the optimal hyperparameters.

In [None]:
# Load the best-model training script as a module
best_model_path = os.path.join(PROJECT_ROOT, '4_Train_Best_Model', 'scripts', 'train_best_model.py')
best_model_module = import_module_from_file(best_model_path)

# Input/output locations and training settings
data_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'output', 'rainfall_prediction_data.h5')
hyperparams_path = os.path.join(
    PROJECT_ROOT,
    '3_Hyperparameter_Tuning',
    'output',
    'land_model_extended_tuner',
    'current_best_hyperparameters.py',
)
output_dir = os.path.join(PROJECT_ROOT, '4_Train_Best_Model', 'output')
epochs, batch_size = 50, 32
test_split = val_split = 0.1
random_seed = 42


# Stand-in for the namespace argparse would produce
class BestModelArgs:
    """Attribute container mimicking parsed command-line arguments.

    Every constructor argument is stored unchanged under an attribute of the
    same name.
    """

    def __init__(self, data_path, hyperparams_path, output_dir, epochs, batch_size, test_split, val_split, random_seed):
        for attr_name, attr_value in (
            ('data_path', data_path),
            ('hyperparams_path', hyperparams_path),
            ('output_dir', output_dir),
            ('epochs', epochs),
            ('batch_size', batch_size),
            ('test_split', test_split),
            ('val_split', val_split),
            ('random_seed', random_seed),
        ):
            setattr(self, attr_name, attr_value)


best_model_args = BestModelArgs(
    data_path, hyperparams_path, output_dir,
    epochs, batch_size, test_split, val_split, random_seed,
)

# Announce the run settings
print(f"Training best model with {epochs} epochs and batch size {batch_size}")
print(f"Output directory: {output_dir}")

# Option 1: call the script's main function directly with the namespace above
# Uncomment this to run directly
# best_model_module.main(best_model_args)

In [None]:
# Option 2: Run as a terminal command with specific parameters
# The command is assembled from an argument list and quoted with shlex.join,
# so paths containing spaces or shell metacharacters are passed intact.
# NOTE: `data_path`, `output_dir`, `epochs` etc. are shared names that other
# sections of this notebook also assign; run this cell right after the
# best-model parameter cell.
best_model_cmd_parts = [
    "python3", best_model_path,
    "--data_path", data_path,
    "--hyperparams_path", hyperparams_path,
    "--output_dir", output_dir,
    "--epochs", str(epochs),
    "--batch_size", str(batch_size),
    "--test_split", str(test_split),
    "--val_split", str(val_split),
    "--random_seed", str(random_seed),
]
cmd = shlex.join(best_model_cmd_parts)

print(f"Command: {cmd}")
# Uncomment to run
# output = run_command(cmd)
# print(output)

## 5. Train Ensemble Model

This section trains an ensemble of models for improved prediction accuracy.

In [None]:
# Load the ensemble training script as a module
ensemble_path = os.path.join(PROJECT_ROOT, '5_Train_Ensemble', 'scripts', 'simple_ensemble.py')
ensemble_module = import_module_from_file(ensemble_path)

# Input/output locations and ensemble settings
data_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'output', 'rainfall_prediction_data.h5')
hyperparams_path = os.path.join(
    PROJECT_ROOT,
    '3_Hyperparameter_Tuning',
    'output',
    'land_model_extended_tuner',
    'current_best_hyperparameters.py',
)
output_dir = os.path.join(PROJECT_ROOT, '5_Train_Ensemble', 'output')
n_folds, n_models_per_fold = 5, 3
epochs, batch_size = 50, 32
random_seed = 42


# Stand-in for the namespace argparse would produce
class EnsembleArgs:
    """Attribute container mimicking parsed command-line arguments.

    Every constructor argument is stored unchanged under an attribute of the
    same name.
    """

    def __init__(self, data_path, hyperparams_path, output_dir, n_folds, n_models_per_fold, epochs, batch_size, random_seed):
        for attr_name, attr_value in (
            ('data_path', data_path),
            ('hyperparams_path', hyperparams_path),
            ('output_dir', output_dir),
            ('n_folds', n_folds),
            ('n_models_per_fold', n_models_per_fold),
            ('epochs', epochs),
            ('batch_size', batch_size),
            ('random_seed', random_seed),
        ):
            setattr(self, attr_name, attr_value)


ensemble_args = EnsembleArgs(
    data_path, hyperparams_path, output_dir,
    n_folds, n_models_per_fold, epochs, batch_size, random_seed,
)

# Announce the run settings
print(f"Training ensemble with {n_folds} folds, {n_models_per_fold} models per fold, {epochs} epochs")
print(f"Output directory: {output_dir}")

# Option 1: call the script's main function directly with the namespace above
# Uncomment this to run directly
# ensemble_module.main(ensemble_args)

In [None]:
# Option 2: Run as a terminal command with specific parameters
# The command is assembled from an argument list and quoted with shlex.join,
# so paths containing spaces or shell metacharacters are passed intact.
# NOTE: `data_path`, `output_dir`, `epochs` etc. are shared names that other
# sections of this notebook also assign; run this cell right after the
# ensemble parameter cell.
ensemble_cmd_parts = [
    "python3", ensemble_path,
    "--data_path", data_path,
    "--hyperparams_path", hyperparams_path,
    "--output_dir", output_dir,
    "--n_folds", str(n_folds),
    "--n_models_per_fold", str(n_models_per_fold),
    "--epochs", str(epochs),
    "--batch_size", str(batch_size),
    "--random_seed", str(random_seed),
]
cmd = shlex.join(ensemble_cmd_parts)

print(f"Command: {cmd}")
# Uncomment to run
# output = run_command(cmd)
# print(output)

## Visualize Results

This section visualizes the results from the trained models.

In [None]:
# Print the text summary written by the ensemble run, if one exists
ensemble_summary_path = os.path.join(
    os.path.join(PROJECT_ROOT, '5_Train_Ensemble', 'output', 'simple_ensemble'),
    'ensemble_summary.txt',
)

if not os.path.exists(ensemble_summary_path):
    print(f"Ensemble summary not found at {ensemble_summary_path}")
else:
    with open(ensemble_summary_path, 'r') as summary_file:
        ensemble_summary = summary_file.read()
    print("Ensemble Summary:")
    print(ensemble_summary)

In [None]:
# Render the scatter plot image saved by the ensemble run, if one exists
ensemble_scatter_path = os.path.join(
    os.path.join(PROJECT_ROOT, '5_Train_Ensemble', 'output', 'simple_ensemble'),
    'ensemble_scatter.png',
)

if not os.path.exists(ensemble_scatter_path):
    print(f"Ensemble scatter plot not found at {ensemble_scatter_path}")
else:
    from IPython.display import Image
    scatter_image = Image(filename=ensemble_scatter_path)
    display(scatter_image)

## Custom Testing

This section allows you to perform custom tests on the trained models.

In [None]:
# Example: load one trained ensemble member and print its architecture
import tensorflow as tf

# File of the first model trained on the first fold
model_path = os.path.join(
    os.path.join(PROJECT_ROOT, '5_Train_Ensemble', 'output', 'simple_ensemble'),
    'fold_1', 'model_1', 'model.h5',
)

if not os.path.exists(model_path):
    print(f"Model not found at {model_path}")
else:
    # Restore the saved Keras model from disk
    model = tf.keras.models.load_model(model_path)
    print(f"Loaded model from {model_path}")
    print("Model summary:")
    model.summary()

In [None]:
# Example: make predictions with a sample input.
# Test data has to be loaded and preprocessed the same way as for training,
# so the example below must be adapted to your specific data format.

# Load the data_utils helper script that ships with the best-model stage
data_utils_path = os.path.join(
    os.path.join(PROJECT_ROOT, '4_Train_Best_Model', 'scripts'),
    'data_utils.py',
)
data_utils = import_module_from_file(data_utils_path)

# Template for loading data and plotting predictions against observations.
# It relies on `model` from the previous cell. Uncomment and adapt as needed.

# data_path = os.path.join(PROJECT_ROOT, '2_Create_ML_Data', 'output', 'rainfall_prediction_data.h5')
# data = data_utils.load_data(data_path)
# test_features = data['features']['test']
# test_targets = data['targets']['test']
#
# # Make predictions
# predictions = model.predict(test_features)
#
# # Convert back to inches (if needed)
# # NOTE(review): the factor 100 assumes targets were scaled by 1/100 -- confirm
# predictions_inches = predictions * 100
# test_targets_inches = test_targets * 100
#
# # Plot predictions vs. actual
# plt.figure(figsize=(10, 8))
# plt.scatter(test_targets_inches, predictions_inches, alpha=0.5)
# plt.plot([0, max(test_targets_inches)], [0, max(test_targets_inches)], 'r--')
# plt.xlabel('Actual Rainfall (inches)')
# plt.ylabel('Predicted Rainfall (inches)')
# plt.title('Predicted vs. Actual Rainfall')
# plt.grid(True)
# plt.show()