In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e3/sample_submission.csv
/kaggle/input/playground-series-s5e3/train.csv
/kaggle/input/playground-series-s5e3/test.csv


In [14]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

train_data_rt = "/kaggle/input/playground-series-s5e3/train.csv"
test_data_rt = "/kaggle/input/playground-series-s5e3/test.csv"

In [15]:
import torch.nn as nn
import torch

# Initializing The model

class RainfallClassifierModel(nn.Module):
    def __init__(self):
        super(RainfallClassifierModel, self).__init__()
        self.base_layers = nn.Sequential(
            nn.Linear(54, 128),
            nn.ELU(),
            nn.BatchNorm1d(128),
            # nn.Dropout(0.2),  # 20% dropout
    
            # nn.Linear(128, 128),
            # nn.ELU(),
            # nn.BatchNorm1d(128),
            # nn.Dropout(0.2),

            nn.Linear(128, 1)
)
        self.flatten = nn.Flatten()
        
    def forward(self, x):
        x = self.flatten(x)
        x = self.base_layers(x)

        return x

In [16]:
# Definfing the feature engineering process

def feature_engineering(df):
    """
    Create new features based on meteorological understanding and data analysis,
    with 'day' representing day of the year (1-365).
    Ensures no data leakage by avoiding use of the target variable (rainfall).
    """
    # Make a copy to avoid modifying the original dataframe
    enhanced_df = df.copy()
    
    # 1. temparature range (difference between max and min temparatures)
    enhanced_df['temp_range'] = enhanced_df['maxtemp'] - enhanced_df['mintemp']
    
    # 2. Dew point depression (difference between temparature and dew point)
    enhanced_df['dewpoint_depression'] = enhanced_df['temparature'] - enhanced_df['dewpoint']
    
    # 3. Pressure change from previous day
    enhanced_df['pressure_change'] = enhanced_df['pressure'].diff().fillna(0)
    
    # 4. Humidity to dew point ratio
    enhanced_df['humidity_dewpoint_ratio'] = enhanced_df['humidity'] / enhanced_df['dewpoint'].clip(lower=0.1)
    
    # 5. Cloud coverage to sunshine ratio (inverse relationship)
    enhanced_df['cloud_sunshine_ratio'] = enhanced_df['cloud'] / enhanced_df['sunshine'].clip(lower=0.1)
    
    # 6. Wind intensity factor (combination of speed and humidity)
    enhanced_df['wind_humidity_factor'] = enhanced_df['windspeed'] * (enhanced_df['humidity'] / 100)
    
    # 7. temparature-humidity index (simple version of heat index)
    enhanced_df['temp_humidity_index'] = (0.8 * enhanced_df['temparature']) + \
                                        ((enhanced_df['humidity'] / 100) * \
                                        (enhanced_df['temparature'] - 14.3)) + 46.4
    
    # 8. Pressure change rate (acceleration)
    enhanced_df['pressure_acceleration'] = enhanced_df['pressure_change'].diff().fillna(0)
    
    # 9. Seasonal features (based on day of year)
    # Convert day to month (1-365 to 1-12)
    enhanced_df['month'] = ((enhanced_df['day'] - 1) // 30) + 1
    enhanced_df['month'] = enhanced_df['month'].clip(upper=12)  # Ensure month doesn't exceed 12
    
    # 10. Convert day to season (1-365 to 1-4)
    enhanced_df['season'] = ((enhanced_df['month'] - 1) // 3) + 1
    
    # 11. Sine and cosine transformations to capture cyclical nature of days in a year
    enhanced_df['day_of_year_sin'] = np.sin(2 * np.pi * enhanced_df['day'] / 365)
    enhanced_df['day_of_year_cos'] = np.cos(2 * np.pi * enhanced_df['day'] / 365)
    
    # 12. Rolling averages for key meteorological variables
    for window in [3, 7, 14]:
        enhanced_df[f'temparature_rolling_{window}d'] = enhanced_df['temparature'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'pressure_rolling_{window}d'] = enhanced_df['pressure'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'humidity_rolling_{window}d'] = enhanced_df['humidity'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'cloud_rolling_{window}d'] = enhanced_df['cloud'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'windspeed_rolling_{window}d'] = enhanced_df['windspeed'].rolling(window=window, min_periods=1).mean()
    
    # 13. Weather pattern change features
    # temparature trend
    enhanced_df['temp_trend_3d'] = enhanced_df['temparature'].diff(3).fillna(0)
    # Pressure trend
    enhanced_df['pressure_trend_3d'] = enhanced_df['pressure'].diff(3).fillna(0)
    # Humidity trend
    enhanced_df['humidity_trend_3d'] = enhanced_df['humidity'].diff(3).fillna(0)
    
    # 14. Extreme weather indicators
    enhanced_df['extreme_temp'] = (enhanced_df['temparature'] > enhanced_df['temparature'].quantile(0.95)) | \
                                 (enhanced_df['temparature'] < enhanced_df['temparature'].quantile(0.05))
    enhanced_df['extreme_temp'] = enhanced_df['extreme_temp'].astype(int)
    
    enhanced_df['extreme_humidity'] = (enhanced_df['humidity'] > enhanced_df['humidity'].quantile(0.95)) | \
                                     (enhanced_df['humidity'] < enhanced_df['humidity'].quantile(0.05))
    enhanced_df['extreme_humidity'] = enhanced_df['extreme_humidity'].astype(int)
    
    enhanced_df['extreme_pressure'] = (enhanced_df['pressure'] > enhanced_df['pressure'].quantile(0.95)) | \
                                     (enhanced_df['pressure'] < enhanced_df['pressure'].quantile(0.05))
    enhanced_df['extreme_pressure'] = enhanced_df['extreme_pressure'].astype(int)
    
    # 15. Interaction terms between key variables
    enhanced_df['temp_humidity_interaction'] = enhanced_df['temparature'] * enhanced_df['humidity']
    enhanced_df['pressure_wind_interaction'] = enhanced_df['pressure'] * enhanced_df['windspeed']
    enhanced_df['cloud_sunshine_interaction'] = enhanced_df['cloud'] * enhanced_df['sunshine']
    enhanced_df['dewpoint_humidity_interaction'] = enhanced_df['dewpoint'] * enhanced_df['humidity']
    
    # 16. Moving standard deviations for measuring variability
    for window in [7, 14]:
        enhanced_df[f'temp_std_{window}d'] = enhanced_df['temparature'].rolling(window=window, min_periods=4).std().fillna(0)
        enhanced_df[f'pressure_std_{window}d'] = enhanced_df['pressure'].rolling(window=window, min_periods=4).std().fillna(0)
        enhanced_df[f'humidity_std_{window}d'] = enhanced_df['humidity'].rolling(window=window, min_periods=4).std().fillna(0)
    
    return enhanced_df

In [17]:
train_data = pd.read_csv(train_data_rt)
test_data = pd.read_csv(test_data_rt)
indices = test_data['id']
test_data = test_data.drop(columns = ["id"])

train_data_fe = feature_engineering(train_data)
test_data_fe = feature_engineering(test_data).to_numpy()

In [18]:
# Creating the custom dataloader for loading the data

from torch.utils.data import Dataset, DataLoader

class CSVDataset(Dataset):
    def __init__(self, df):
        self.data = df.drop(columns = ["id", "rainfall"]).values # In order to load them as a numpy array
        self.target = df["rainfall"].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx, :], dtype = torch.float32)
        labels = torch.tensor(self.target[idx], dtype = torch.float32)

        return features, labels

In [19]:
# Initializing the model and other functionalities
import torch.optim as optim

# model = RainfallClassifierModel()
# optimizer = optim.Adam(model.parameters(), lr = 0.0001)
# criterion = nn.BCEWithLogitsLoss()

dataset = CSVDataset(train_data_fe)
# dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)

In [20]:
# Function to initialize weights

import torch.nn.init as init

def initialize_weights(model, init_type="xavier"):
    for m in model.modules():
        if isinstance(m, nn.Linear):  # Apply to linear layers
            if init_type == "xavier":
                init.xavier_uniform_(m.weight)  # Xavier initialization
            elif init_type == "he":
                init.kaiming_uniform_(m.weight, nonlinearity='relu')  # He initialization
            elif init_type == "orthogonal":
                init.orthogonal_(m.weight)  # Orthogonal initialization
            else:
                raise ValueError("Unknown initialization type")
            if m.bias is not None:
                init.zeros_(m.bias)  # Initialize bias to zero


In [21]:
from sklearn.metrics import roc_auc_score
from torch.utils.data import Subset
from sklearn.model_selection import KFold
from tqdm import tqdm

# Hyperparameters
# input_dim = len(features)
epochs = 50
learning_rate = 0.000035
k_folds = 5

# Cross-validation and training
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
roc_auc_scores = []
models = []

for fold, (train_index, val_index) in enumerate(kf.split(dataset)):
    print(f"Fold {fold + 1}")
    data_train = Subset(dataset, train_index)
    data_val = Subset(dataset, val_index)

    train_subset = DataLoader(data_train, batch_size = 32, shuffle = True)
    val_subset = DataLoader(data_val, batch_size = 32, shuffle = True)

    model = RainfallClassifierModel()
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    # Initializing the weights
    initialize_weights(model, init_type = "he")

    # Early Stopping
    best_roc_auc = 0
    patience = 50
    no_improvement_count = 0
    roc_history = [] 
    for epoch in tqdm(range(epochs)):
        for batch_x, batch_y in train_subset:
        # Training
            model.train()
            optimizer.zero_grad()
            outputs = model(batch_x).squeeze(1)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        for val_x, val_y in val_subset:
            with torch.no_grad():
                val_outputs = model(val_x).squeeze(1)
                roc_auc = roc_auc_score(val_y.numpy(), val_outputs.numpy())
                roc_history.append(roc_auc)   
            if roc_auc > best_roc_auc:
                best_roc_auc = roc_auc
                no_improvement_count = 0
            else:
                no_improvement_count +=1
                if no_improvement_count >= patience:
                    print(f"Early stopping at epoch {epoch} with {best_roc_auc = }")
                    break

    roc_auc_scores.append(best_roc_auc)
    if best_roc_auc > 0.95 :
        models.append(model)
print(f"Average ROC AUC across folds: {np.mean(roc_auc_scores):.4f}")

Fold 1


 18%|█▊        | 9/50 [00:01<00:05,  6.86it/s]

Early stopping at epoch 7 with best_roc_auc = 0.7135416666666667
Early stopping at epoch 8 with best_roc_auc = 0.7135416666666667


 22%|██▏       | 11/50 [00:01<00:05,  7.26it/s]

Early stopping at epoch 9 with best_roc_auc = 0.7135416666666667
Early stopping at epoch 10 with best_roc_auc = 0.7135416666666667


 26%|██▌       | 13/50 [00:01<00:05,  6.99it/s]

Early stopping at epoch 11 with best_roc_auc = 0.7135416666666667


 38%|███▊      | 19/50 [00:02<00:04,  7.03it/s]

Early stopping at epoch 17 with best_roc_auc = 0.890909090909091
Early stopping at epoch 18 with best_roc_auc = 0.890909090909091


 42%|████▏     | 21/50 [00:03<00:03,  7.38it/s]

Early stopping at epoch 19 with best_roc_auc = 0.890909090909091
Early stopping at epoch 20 with best_roc_auc = 0.890909090909091


 46%|████▌     | 23/50 [00:03<00:03,  7.55it/s]

Early stopping at epoch 21 with best_roc_auc = 0.890909090909091
Early stopping at epoch 22 with best_roc_auc = 0.890909090909091


 50%|█████     | 25/50 [00:03<00:03,  7.53it/s]

Early stopping at epoch 23 with best_roc_auc = 0.890909090909091
Early stopping at epoch 24 with best_roc_auc = 0.890909090909091


 54%|█████▍    | 27/50 [00:03<00:03,  7.19it/s]

Early stopping at epoch 25 with best_roc_auc = 0.890909090909091
Early stopping at epoch 26 with best_roc_auc = 0.890909090909091


 58%|█████▊    | 29/50 [00:04<00:02,  7.42it/s]

Early stopping at epoch 27 with best_roc_auc = 0.890909090909091
Early stopping at epoch 28 with best_roc_auc = 0.890909090909091


 62%|██████▏   | 31/50 [00:04<00:02,  7.57it/s]

Early stopping at epoch 29 with best_roc_auc = 0.890909090909091
Early stopping at epoch 30 with best_roc_auc = 0.890909090909091


 66%|██████▌   | 33/50 [00:04<00:02,  7.21it/s]

Early stopping at epoch 31 with best_roc_auc = 0.890909090909091


 94%|█████████▍| 47/50 [00:06<00:00,  6.81it/s]

Early stopping at epoch 45 with best_roc_auc = 0.9590909090909092
Early stopping at epoch 46 with best_roc_auc = 0.9590909090909092


 98%|█████████▊| 49/50 [00:07<00:00,  7.22it/s]

Early stopping at epoch 47 with best_roc_auc = 0.9590909090909092
Early stopping at epoch 48 with best_roc_auc = 0.9590909090909092


100%|██████████| 50/50 [00:07<00:00,  6.83it/s]


Early stopping at epoch 49 with best_roc_auc = 0.9590909090909092
Fold 2


 24%|██▍       | 12/50 [00:01<00:05,  6.58it/s]

Early stopping at epoch 10 with best_roc_auc = 0.7767857142857143
Early stopping at epoch 11 with best_roc_auc = 0.7767857142857143


 28%|██▊       | 14/50 [00:02<00:05,  6.62it/s]

Early stopping at epoch 12 with best_roc_auc = 0.7767857142857143
Early stopping at epoch 13 with best_roc_auc = 0.7767857142857143


 32%|███▏      | 16/50 [00:02<00:05,  6.09it/s]

Early stopping at epoch 14 with best_roc_auc = 0.7767857142857143


 46%|████▌     | 23/50 [00:03<00:04,  6.24it/s]

Early stopping at epoch 21 with best_roc_auc = 1.0
Early stopping at epoch 22 with best_roc_auc = 1.0


 50%|█████     | 25/50 [00:04<00:03,  6.91it/s]

Early stopping at epoch 23 with best_roc_auc = 1.0
Early stopping at epoch 24 with best_roc_auc = 1.0


 54%|█████▍    | 27/50 [00:04<00:03,  7.36it/s]

Early stopping at epoch 25 with best_roc_auc = 1.0
Early stopping at epoch 26 with best_roc_auc = 1.0


 58%|█████▊    | 29/50 [00:04<00:02,  7.53it/s]

Early stopping at epoch 27 with best_roc_auc = 1.0
Early stopping at epoch 28 with best_roc_auc = 1.0


 62%|██████▏   | 31/50 [00:04<00:02,  7.66it/s]

Early stopping at epoch 29 with best_roc_auc = 1.0
Early stopping at epoch 30 with best_roc_auc = 1.0


 66%|██████▌   | 33/50 [00:05<00:02,  7.64it/s]

Early stopping at epoch 31 with best_roc_auc = 1.0
Early stopping at epoch 32 with best_roc_auc = 1.0


 70%|███████   | 35/50 [00:05<00:01,  7.62it/s]

Early stopping at epoch 33 with best_roc_auc = 1.0
Early stopping at epoch 34 with best_roc_auc = 1.0


 74%|███████▍  | 37/50 [00:05<00:01,  7.67it/s]

Early stopping at epoch 35 with best_roc_auc = 1.0
Early stopping at epoch 36 with best_roc_auc = 1.0


 78%|███████▊  | 39/50 [00:05<00:01,  7.73it/s]

Early stopping at epoch 37 with best_roc_auc = 1.0
Early stopping at epoch 38 with best_roc_auc = 1.0


 82%|████████▏ | 41/50 [00:06<00:01,  7.71it/s]

Early stopping at epoch 39 with best_roc_auc = 1.0
Early stopping at epoch 40 with best_roc_auc = 1.0


 86%|████████▌ | 43/50 [00:06<00:00,  7.34it/s]

Early stopping at epoch 41 with best_roc_auc = 1.0
Early stopping at epoch 42 with best_roc_auc = 1.0


 90%|█████████ | 45/50 [00:06<00:00,  7.27it/s]

Early stopping at epoch 43 with best_roc_auc = 1.0
Early stopping at epoch 44 with best_roc_auc = 1.0


 94%|█████████▍| 47/50 [00:06<00:00,  7.50it/s]

Early stopping at epoch 45 with best_roc_auc = 1.0
Early stopping at epoch 46 with best_roc_auc = 1.0


 98%|█████████▊| 49/50 [00:07<00:00,  7.60it/s]

Early stopping at epoch 47 with best_roc_auc = 1.0
Early stopping at epoch 48 with best_roc_auc = 1.0


100%|██████████| 50/50 [00:07<00:00,  6.82it/s]


Early stopping at epoch 49 with best_roc_auc = 1.0
Fold 3


 24%|██▍       | 12/50 [00:01<00:05,  6.82it/s]

Early stopping at epoch 10 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 11 with best_roc_auc = 0.9028571428571429


 28%|██▊       | 14/50 [00:02<00:04,  7.33it/s]

Early stopping at epoch 12 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 13 with best_roc_auc = 0.9028571428571429


 32%|███▏      | 16/50 [00:02<00:04,  7.56it/s]

Early stopping at epoch 14 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 15 with best_roc_auc = 0.9028571428571429


 36%|███▌      | 18/50 [00:02<00:04,  7.76it/s]

Early stopping at epoch 16 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 17 with best_roc_auc = 0.9028571428571429


 40%|████      | 20/50 [00:02<00:03,  7.86it/s]

Early stopping at epoch 18 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 19 with best_roc_auc = 0.9028571428571429


 44%|████▍     | 22/50 [00:03<00:03,  7.93it/s]

Early stopping at epoch 20 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 21 with best_roc_auc = 0.9028571428571429


 48%|████▊     | 24/50 [00:03<00:03,  7.91it/s]

Early stopping at epoch 22 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 23 with best_roc_auc = 0.9028571428571429


 52%|█████▏    | 26/50 [00:03<00:03,  7.88it/s]

Early stopping at epoch 24 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 25 with best_roc_auc = 0.9028571428571429


 56%|█████▌    | 28/50 [00:03<00:02,  7.82it/s]

Early stopping at epoch 26 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 27 with best_roc_auc = 0.9028571428571429


 60%|██████    | 30/50 [00:04<00:02,  7.74it/s]

Early stopping at epoch 28 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 29 with best_roc_auc = 0.9028571428571429


 64%|██████▍   | 32/50 [00:04<00:02,  7.62it/s]

Early stopping at epoch 30 with best_roc_auc = 0.9028571428571429
Early stopping at epoch 31 with best_roc_auc = 0.9028571428571429


 68%|██████▊   | 34/50 [00:04<00:02,  7.00it/s]

Early stopping at epoch 32 with best_roc_auc = 0.9028571428571429


 78%|███████▊  | 39/50 [00:05<00:01,  6.85it/s]

Early stopping at epoch 37 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 38 with best_roc_auc = 0.9485714285714285


 82%|████████▏ | 41/50 [00:05<00:01,  7.23it/s]

Early stopping at epoch 39 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 40 with best_roc_auc = 0.9485714285714285


 86%|████████▌ | 43/50 [00:06<00:00,  7.45it/s]

Early stopping at epoch 41 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 42 with best_roc_auc = 0.9485714285714285


 90%|█████████ | 45/50 [00:06<00:00,  7.54it/s]

Early stopping at epoch 43 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 44 with best_roc_auc = 0.9485714285714285


 94%|█████████▍| 47/50 [00:06<00:00,  7.61it/s]

Early stopping at epoch 45 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 46 with best_roc_auc = 0.9485714285714285


 98%|█████████▊| 49/50 [00:06<00:00,  7.65it/s]

Early stopping at epoch 47 with best_roc_auc = 0.9485714285714285
Early stopping at epoch 48 with best_roc_auc = 0.9485714285714285


100%|██████████| 50/50 [00:06<00:00,  7.23it/s]


Early stopping at epoch 49 with best_roc_auc = 0.9485714285714285
Fold 4


  0%|          | 0/50 [00:00<?, ?it/s]


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
# Training and Validation Code

# from sklearn.metrics import roc_auc_score
# from torch.utils.data import Subset
# from sklearn.model_selection import KFold

# num_of_epochs = 50

# for i in range(num_of_epochs):
#     model.train()
#     for batch_x, batch_y in dataloader:
#         optimizer.zero_grad()
#         y_pred = model(batch_x).squeeze(1)
        
#         loss = criterion(y_pred, batch_y.float())  # BCE loss
        
#         loss.backward()
#         optimizer.step()
#     print(f"Epoch : {i} --------- Loss : {loss}")

In [None]:
# Prediction code

X_test_tensor = torch.tensor(test_data_fe, dtype=torch.float32)
test_predictions = np.zeros ((X_test_tensor.shape [0]))

for model in models:
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)
        test_predictions = np.add(predictions.numpy().flatten(), test_predictions)
        
test_predictions = test_predictions / len(models)
clean_test_predictions = np.nan_to_num(test_predictions, nan=0.8)

# model.eval()

# with torch.no_grad():  # No gradients needed for inference
#     logits = model(X_test_tensor)  # Forward pass
#     probabilities = torch.sigmoid(logits)  # Apply Sigmoid (for binary classification)

# probabilities = torch.nan_to_num(probabilities, nan=0.5)

# print(torch.isnan(probabilities).sum())  # Count NaN values
# print(torch.isinf(probabilities).sum())

In [None]:
# Concatenation

final_dataframe = np.concatenate((np.array(indices).reshape(-1, 1), clean_test_predictions.reshape(-1, 1)), axis = 1)
submission = pd.DataFrame(final_dataframe, columns = ["id", "rainfall"])
submission["id"] = submission["id"].astype(int)
submission.to_csv("submission.csv", index = False)

submission.isnull().sum()