In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load and preprocess (your existing function)
def load_and_prepare(path, suffix):
    """Read one region's CSV and return its weather columns tagged with ``suffix``.

    The file is parsed with ``valid_time`` as a datetime column and ordered
    chronologically. ``tp`` is multiplied by 1000 and then passed through
    ``log1p``. Only ``valid_time``, ``u10``, ``v10``, ``t2m``, ``sp`` and
    ``tp`` are kept, and every column except ``valid_time`` is renamed to
    ``<name>_<suffix>``.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or file-like object).
        suffix: Region tag appended to each variable column name.

    Returns:
        DataFrame with columns ``valid_time`` and ``<var>_<suffix>`` for the
        five kept variables, sorted by ``valid_time``.
    """
    kept = ['u10', 'v10', 't2m', 'sp', 'tp']
    raw = pd.read_csv(path, parse_dates=['valid_time']).sort_values('valid_time')
    # m to mm, then log transform precipitation
    raw['tp'] = np.log1p(raw['tp'] * 1000)
    region = raw[['valid_time'] + kept]
    return region.rename(columns={name: f'{name}_{suffix}' for name in kept})

# Load data for each region
df_center = load_and_prepare('data/brazil.csv', 'center')
df_north = load_and_prepare('data/brazil_north.csv', 'north')
df_south = load_and_prepare('data/brazil_south.csv', 'south')
df_east  = load_and_prepare('data/brazil_east.csv', 'east')
df_west  = load_and_prepare('data/brazil_west.csv', 'west')

# Merge on valid_time (default inner join: only timestamps present in every
# region's frame are kept)
df = df_center.merge(df_north, on='valid_time')
df = df.merge(df_south, on='valid_time')
df = df.merge(df_east, on='valid_time')
df = df.merge(df_west, on='valid_time')
df = df.sort_values('valid_time').reset_index(drop=True)

# Add cyclical time features (fixed 365-day period)
df['dayofyear'] = df['valid_time'].dt.dayofyear
df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365)

# Define features to scale (meteorological variables only)
center_features = ['u10_center', 'v10_center', 't2m_center', 'sp_center', 'tp_center']
north_features = [f"{feat}_north" for feat in ['u10', 'v10', 't2m', 'sp', 'tp']]
south_features = [f"{feat}_south" for feat in ['u10', 'v10', 't2m', 'sp', 'tp']]
east_features  = [f"{feat}_east"  for feat in ['u10', 'v10', 't2m', 'sp', 'tp']]
west_features  = [f"{feat}_west"  for feat in ['u10', 'v10', 't2m', 'sp', 'tp']]

all_meteorological_features = center_features + north_features + south_features + east_features + west_features

# Train-test split index (time based): the first 60% of rows are "training"
train_size = int(0.6 * len(df))

# Fit the feature scaler on the training rows only.
# The index was reset above, so the label slice :train_size-1 selects exactly
# the first `train_size` rows.
scaler = StandardScaler()
scaler.fit(df.loc[:train_size-1, all_meteorological_features])

# Dedicated scaler for the target so predictions can be inverse-transformed
# later. It is fit on the training rows only (same rows as `scaler`) so that
# validation/test statistics do not leak into the target scaling.
# This must run before `tp_center` itself is overwritten by the scaled values.
tp_scaler = StandardScaler()
tp_scaler.fit(df.loc[:train_size-1, ['tp_center']])
df['tp_center_scaled'] = tp_scaler.transform(df[['tp_center']]).ravel()

# Transform entire dataset (train+val+test) with the train-fitted scaler
df[all_meteorological_features] = scaler.transform(df[all_meteorological_features])


# Generate lag, rolling, diff features *after* scaling on tp_center ONLY
lags = [1, 3, 6, 12, 24]
rolling_windows = [3, 6, 12]

for lag in lags:
    df[f'tp_lag_{lag}h'] = df['tp_center'].shift(lag)
for window in rolling_windows:
    df[f'tp_roll_mean_{window}h'] = df['tp_center'].rolling(window).mean()

df['tp_diff_1h'] = df['tp_center'].diff(1)

# Drop rows containing NaNs (the first rows have no lag/rolling history)
df = df.dropna()

# Features for model input: meteorological + cyclical + lags/roll/diff
cyclical_features = ['dayofyear_sin', 'dayofyear_cos']
lag_features = [f'tp_lag_{lag}h' for lag in lags]
roll_features = [f'tp_roll_mean_{w}h' for w in rolling_windows]
diff_features = ['tp_diff_1h']

feature_cols = all_meteorological_features + cyclical_features + lag_features + roll_features + diff_features

print(f"Final feature columns count: {len(feature_cols)}")

# Sequence creation function
def create_sequences(df, seq_len=360, horizon=7, feature_columns=None,
                     target_column='tp_center_scaled'):
    """Slice a frame into (input window, forecast target) pairs.

    For every position ``i`` such that a full input window and a full target
    window fit, the input is rows ``[i - seq_len, i)`` of the feature columns
    and the target is rows ``[i, i + horizon)`` of ``target_column``.

    Args:
        df: DataFrame holding the feature columns and the target column,
            already in the desired (chronological) row order.
        seq_len: Number of rows in each input window.
        horizon: Number of rows in each target window.
        feature_columns: Columns used as model inputs. When ``None`` the
            module-level ``feature_cols`` list is used.
        target_column: Column the targets are read from.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n, seq_len, n_features)`` and
        ``y`` has shape ``(n, horizon)``. ``n`` is 0 (with the same trailing
        dimensions) when ``df`` is too short for a single window.
    """
    if feature_columns is None:
        feature_columns = feature_cols

    # Convert once up front instead of slicing the DataFrame on every iteration.
    features = df[list(feature_columns)].to_numpy()
    target = df[target_column].to_numpy()

    # The last valid start is len(df) - horizon (its target window ends exactly
    # at the final row), hence the +1 on the exclusive range bound.
    starts = range(seq_len, len(df) - horizon + 1)
    if len(starts) == 0:
        return (np.empty((0, seq_len, features.shape[1]), dtype=features.dtype),
                np.empty((0, horizon), dtype=target.dtype))

    X = np.stack([features[i - seq_len:i] for i in starts])
    y = np.stack([target[i:i + horizon] for i in starts])
    return X, y

# Build the model-ready arrays: each sample is a window of `sequence_length`
# rows paired with the following `forecast_horizon` target values.
sequence_length, forecast_horizon = 360, 7
X, y = create_sequences(df, seq_len=sequence_length, horizon=forecast_horizon)

print(f"X shape: {X.shape}, y shape: {y.shape}")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Input, Dropout, GlobalAveragePooling1D, Dense, LayerNormalization, MultiHeadAttention, Add, Concatenate
from tensorflow.keras.models import Model

# Positional encoding
def get_positional_encoding(seq_len, d_model):
    """Build a fixed sinusoidal positional-encoding table.

    Even-indexed columns hold ``sin(position * freq)`` and odd-indexed columns
    hold ``cos(position * freq)``, with frequencies
    ``exp(-k * ln(10000) / d_model)`` for ``k = 0, 2, 4, ...``.

    Args:
        seq_len: Number of positions (rows of the table).
        d_model: Encoding width (columns of the table). May be odd.

    Returns:
        A ``tf.float32`` constant of shape ``(seq_len, d_model)``.
    """
    position = np.arange(seq_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    angles = position * div_term  # shape (seq_len, ceil(d_model / 2))

    pe = np.zeros((seq_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even column, so
    # trim the angle table to avoid a shape mismatch on assignment.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return tf.constant(pe, dtype=tf.float32)

# Hybrid loss
def hybrid_loss(y_true, y_pred):
    """Equal-weight blend of Keras' MSE and MAE losses.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions.

    Returns:
        ``0.5 * MSE(y_true, y_pred) + 0.5 * MAE(y_true, y_pred)``.
    """
    squared_term = tf.keras.losses.MSE(y_true, y_pred)
    absolute_term = tf.keras.losses.MAE(y_true, y_pred)
    return 0.5 * squared_term + 0.5 * absolute_term

# Weighted loss
def spike_weighted_loss(y_true, y_pred, spike_threshold=1.0, spike_weight=2.0):
    """Mean squared error with extra weight on large ("spike") targets.

    Elements whose target exceeds ``spike_threshold`` have their squared error
    multiplied by ``spike_weight``; all other elements use weight 1.

    Note: the threshold is compared against ``y_true`` exactly as it is passed
    in. It corresponds to "1 mm" only when the targets are raw millimetres;
    for transformed or standardized targets, pass a threshold expressed in
    that same space.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions.
        spike_threshold: Target value above which an element counts as a spike.
        spike_weight: Weight applied to spike elements.

    Returns:
        Scalar tensor: mean of the weighted squared errors.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so integer labels or mixed-precision outputs do not make
    # the subtraction/multiplication below fail.
    y_true = tf.cast(y_true, y_pred.dtype)
    ones = tf.ones_like(y_true)
    weights = tf.where(y_true > spike_threshold, spike_weight * ones, ones)
    return tf.reduce_mean(weights * tf.square(y_true - y_pred))


# Transformer Encoder block (same as before)
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """One Transformer encoder block with layer norm applied before each sub-layer.

    The block is: LayerNorm -> multi-head self-attention -> dropout -> residual
    add with ``inputs``, followed by LayerNorm -> Dense(ff_dim, relu) ->
    dropout -> Dense back to the input width -> residual add.

    Args:
        inputs: Symbolic tensor to encode.
        head_size: ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used inside attention and after each sub-layer.

    Returns:
        Tensor with the same last dimension as ``inputs``.
    """
    # Self-attention sub-layer with residual connection
    normed = LayerNormalization(epsilon=1e-6)(inputs)
    attended = MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(normed, normed)
    attended = Dropout(dropout)(attended)
    attn_residual = Add()([attended, inputs])

    # Position-wise feed-forward sub-layer with residual connection
    hidden = LayerNormalization(epsilon=1e-6)(attn_residual)
    hidden = Dense(ff_dim, activation='relu')(hidden)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(inputs.shape[-1])(hidden)
    return Add()([projected, attn_residual])

# Encoder for a single region (every region uses the same hyperparameters; each call builds its own weights)
def build_region_encoder(seq_len, feature_dim, conv_filters=32, kernel_size=3,
                         head_size=16, num_heads=2, ff_dim=64, num_layers=1, dropout=0.1):
    """Build a Keras model that encodes one region's sequence into a vector.

    Pipeline: causal Conv1D (relu) -> add sinusoidal positional encoding ->
    ``num_layers`` Transformer encoder blocks -> global average pooling over
    time -> dropout.

    Args:
        seq_len: Length of the input sequence.
        feature_dim: Number of input features per time step.
        conv_filters: Number of Conv1D filters (also the encoding width).
        kernel_size: Conv1D kernel size.
        head_size: Attention ``key_dim`` passed to each Transformer block.
        num_heads: Number of attention heads per block.
        ff_dim: Feed-forward hidden width per block.
        num_layers: Number of stacked Transformer blocks.
        dropout: Dropout rate used in the blocks and after pooling.

    Returns:
        A ``Model`` mapping ``(seq_len, feature_dim)`` inputs to the pooled
        encoding.
    """
    region_input = Input(shape=(seq_len, feature_dim))
    conv_out = Conv1D(
        conv_filters, kernel_size, padding='causal', activation='relu'
    )(region_input)

    # Add a leading batch axis so the table broadcasts over the batch.
    pos_table = tf.expand_dims(get_positional_encoding(seq_len, conv_filters), axis=0)
    encoded = conv_out + pos_table

    for _block in range(num_layers):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    pooled = Dropout(dropout)(GlobalAveragePooling1D()(encoded))
    return Model(region_input, pooled)

import tensorflow as tf

def split_features_by_region(X, feature_names):
    """Split a feature tensor into one tensor per region.

    A feature belongs to the first region (in the order center, north, south,
    east, west) whose name occurs as a substring of the feature name. Features
    matching no region are assigned to ``'center'``.

    Args:
        X: Symbolic tensor whose axis 2 is the feature axis, ordered like
            ``feature_names``.
        feature_names: Feature names, one per index along axis 2 of ``X``.

    Returns:
        Dict mapping each region name to a tensor holding only that region's
        features (selected with ``tf.gather`` along axis 2).
    """
    regions = ['center', 'north', 'south', 'east', 'west']
    columns_by_region = {region: [] for region in regions}

    for position, feature in enumerate(feature_names):
        # First region mentioned in the name wins; engineered features that
        # mention no region fall back to 'center'.
        owner = next((region for region in regions if region in feature), 'center')
        columns_by_region[owner].append(position)

    # Use tf.gather on the symbolic tensor to select each region's columns.
    # The default argument binds the current index list to each lambda.
    return {
        region: tf.keras.layers.Lambda(
            lambda x, idxs=columns: tf.gather(x, indices=idxs, axis=2)
        )(X)
        for region, columns in columns_by_region.items()
    }


# Build the full fusion model
def build_multi_encoder_fusion(seq_len, feature_names,
                               conv_filters=32, kernel_size=3,
                               head_size=16, num_heads=2, ff_dim=64,
                               num_layers=1, dropout=0.1, forecast_horizon=7):
    """Build the multi-region fusion model with regression and rain heads.

    The input is split by region, each region is passed through its own
    encoder built by ``build_region_encoder``, and the encodings are
    concatenated and fed through a shared dense layer into two output heads.

    Args:
        seq_len: Length of the input sequence.
        feature_names: Names of the input features, in input column order.
        conv_filters, kernel_size, head_size, num_heads, ff_dim, num_layers:
            Hyperparameters forwarded to every region encoder.
        dropout: Dropout rate for the encoders and the fusion head.
        forecast_horizon: Number of units in each output head.

    Returns:
        A ``Model`` with outputs ``'tp_amount'`` (linear) and ``'rain_prob'``
        (sigmoid), each of width ``forecast_horizon``.
    """
    model_input = tf.keras.Input(shape=(seq_len, len(feature_names)))
    per_region = split_features_by_region(model_input, feature_names)

    # One encoder per region, built and applied in a fixed order.
    encoded_outputs = [
        build_region_encoder(seq_len, per_region[name].shape[-1],
                             conv_filters, kernel_size, head_size,
                             num_heads, ff_dim, num_layers, dropout)(per_region[name])
        for name in ['center', 'north', 'south', 'east', 'west']
    ]

    fused = Concatenate()(encoded_outputs)
    head = Dropout(dropout)(fused)
    head = Dense(256, activation='relu')(head)
    head = Dropout(dropout)(head)

    # --- MULTI-TASK OUTPUTS ---
    output_regress = Dense(forecast_horizon, name='tp_amount')(head)
    output_classify = Dense(forecast_horizon, activation='sigmoid', name='rain_prob')(head)

    return Model(model_input, outputs=[output_regress, output_classify])



# Model compilation
model = build_multi_encoder_fusion(sequence_length, feature_cols)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss={
        'tp_amount': hybrid_loss,
        'rain_prob': 'binary_crossentropy'
    },
    loss_weights={
        'tp_amount': 1.0,
        'rain_prob': 0.3
    }
)

model.summary()


# Chronological train/val/test split for sequences (60% / 20% / remainder)
train_size = int(0.6 * len(X))
val_size = int(0.2 * len(X))

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size + val_size], y[train_size:train_size + val_size]
X_test, y_test = X[train_size + val_size:], y[train_size + val_size:]

# Define rain threshold in log1p space (e.g. 0.5 mm)
rain_threshold = np.log1p(0.5)

# `y` is built from 'tp_center_scaled', i.e. log1p precipitation standardized
# by `tp_scaler`, so the threshold must be mapped into that same space before
# it is compared with `y`.
rain_threshold_scaled = (rain_threshold - tp_scaler.mean_[0]) / tp_scaler.scale_[0]

# Create binary rain label
y_rain = (y > rain_threshold_scaled).astype(int)

# Train/val/test split for binary rain labels (same boundaries as above)
y_rain_train = y_rain[:train_size]
y_rain_val = y_rain[train_size:train_size + val_size]
y_rain_test = y_rain[train_size + val_size:]


early_stop = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

history = model.fit(
    X_train,
    {'tp_amount': y_train, 'rain_prob': y_rain_train},
    validation_data=(X_val, {'tp_amount': y_val, 'rain_prob': y_rain_val}),
    epochs=20,
    batch_size=16,
    callbacks=[early_stop]
)
