In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e3/sample_submission.csv
/kaggle/input/playground-series-s5e3/train.csv
/kaggle/input/playground-series-s5e3/test.csv


In [2]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

train_data_rt = "/kaggle/input/playground-series-s5e3/train.csv"
test_data_rt = "/kaggle/input/playground-series-s5e3/test.csv"

In [3]:
import torch.nn as nn
import torch

# Initializing The model

class RainfallClassifierModel(nn.Module):
    def __init__(self):
        super(RainfallClassifierModel, self).__init__()
        self.base_layers = nn.Sequential(
            nn.Linear(54, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            # nn.Dropout(0.2),  # 20% dropout
    
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            # nn.Dropout(0.3),

            nn.Linear(512, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            # nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            # nn.Dropout(0.2),

            nn.Linear(256, 1)
)
        self.flatten = nn.Flatten()
        
    def forward(self, x):
        x = self.flatten(x)
        x = self.base_layers(x)

        return x

In [4]:
# Definfing the feature engineering process

def feature_engineering(df):
    """
    Create new features based on meteorological understanding and data analysis,
    with 'day' representing day of the year (1-365).
    Ensures no data leakage by avoiding use of the target variable (rainfall).
    """
    # Make a copy to avoid modifying the original dataframe
    enhanced_df = df.copy()
    
    # 1. temparature range (difference between max and min temparatures)
    enhanced_df['temp_range'] = enhanced_df['maxtemp'] - enhanced_df['mintemp']
    
    # 2. Dew point depression (difference between temparature and dew point)
    enhanced_df['dewpoint_depression'] = enhanced_df['temparature'] - enhanced_df['dewpoint']
    
    # 3. Pressure change from previous day
    enhanced_df['pressure_change'] = enhanced_df['pressure'].diff().fillna(0)
    
    # 4. Humidity to dew point ratio
    enhanced_df['humidity_dewpoint_ratio'] = enhanced_df['humidity'] / enhanced_df['dewpoint'].clip(lower=0.1)
    
    # 5. Cloud coverage to sunshine ratio (inverse relationship)
    enhanced_df['cloud_sunshine_ratio'] = enhanced_df['cloud'] / enhanced_df['sunshine'].clip(lower=0.1)
    
    # 6. Wind intensity factor (combination of speed and humidity)
    enhanced_df['wind_humidity_factor'] = enhanced_df['windspeed'] * (enhanced_df['humidity'] / 100)
    
    # 7. temparature-humidity index (simple version of heat index)
    enhanced_df['temp_humidity_index'] = (0.8 * enhanced_df['temparature']) + \
                                        ((enhanced_df['humidity'] / 100) * \
                                        (enhanced_df['temparature'] - 14.3)) + 46.4
    
    # 8. Pressure change rate (acceleration)
    enhanced_df['pressure_acceleration'] = enhanced_df['pressure_change'].diff().fillna(0)
    
    # 9. Seasonal features (based on day of year)
    # Convert day to month (1-365 to 1-12)
    enhanced_df['month'] = ((enhanced_df['day'] - 1) // 30) + 1
    enhanced_df['month'] = enhanced_df['month'].clip(upper=12)  # Ensure month doesn't exceed 12
    
    # 10. Convert day to season (1-365 to 1-4)
    enhanced_df['season'] = ((enhanced_df['month'] - 1) // 3) + 1
    
    # 11. Sine and cosine transformations to capture cyclical nature of days in a year
    enhanced_df['day_of_year_sin'] = np.sin(2 * np.pi * enhanced_df['day'] / 365)
    enhanced_df['day_of_year_cos'] = np.cos(2 * np.pi * enhanced_df['day'] / 365)
    
    # 12. Rolling averages for key meteorological variables
    for window in [3, 7, 14]:
        enhanced_df[f'temparature_rolling_{window}d'] = enhanced_df['temparature'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'pressure_rolling_{window}d'] = enhanced_df['pressure'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'humidity_rolling_{window}d'] = enhanced_df['humidity'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'cloud_rolling_{window}d'] = enhanced_df['cloud'].rolling(window=window, min_periods=1).mean()
        enhanced_df[f'windspeed_rolling_{window}d'] = enhanced_df['windspeed'].rolling(window=window, min_periods=1).mean()
    
    # 13. Weather pattern change features
    # temparature trend
    enhanced_df['temp_trend_3d'] = enhanced_df['temparature'].diff(3).fillna(0)
    # Pressure trend
    enhanced_df['pressure_trend_3d'] = enhanced_df['pressure'].diff(3).fillna(0)
    # Humidity trend
    enhanced_df['humidity_trend_3d'] = enhanced_df['humidity'].diff(3).fillna(0)
    
    # 14. Extreme weather indicators
    enhanced_df['extreme_temp'] = (enhanced_df['temparature'] > enhanced_df['temparature'].quantile(0.95)) | \
                                 (enhanced_df['temparature'] < enhanced_df['temparature'].quantile(0.05))
    enhanced_df['extreme_temp'] = enhanced_df['extreme_temp'].astype(int)
    
    enhanced_df['extreme_humidity'] = (enhanced_df['humidity'] > enhanced_df['humidity'].quantile(0.95)) | \
                                     (enhanced_df['humidity'] < enhanced_df['humidity'].quantile(0.05))
    enhanced_df['extreme_humidity'] = enhanced_df['extreme_humidity'].astype(int)
    
    enhanced_df['extreme_pressure'] = (enhanced_df['pressure'] > enhanced_df['pressure'].quantile(0.95)) | \
                                     (enhanced_df['pressure'] < enhanced_df['pressure'].quantile(0.05))
    enhanced_df['extreme_pressure'] = enhanced_df['extreme_pressure'].astype(int)
    
    # 15. Interaction terms between key variables
    enhanced_df['temp_humidity_interaction'] = enhanced_df['temparature'] * enhanced_df['humidity']
    enhanced_df['pressure_wind_interaction'] = enhanced_df['pressure'] * enhanced_df['windspeed']
    enhanced_df['cloud_sunshine_interaction'] = enhanced_df['cloud'] * enhanced_df['sunshine']
    enhanced_df['dewpoint_humidity_interaction'] = enhanced_df['dewpoint'] * enhanced_df['humidity']
    
    # 16. Moving standard deviations for measuring variability
    for window in [7, 14]:
        enhanced_df[f'temp_std_{window}d'] = enhanced_df['temparature'].rolling(window=window, min_periods=4).std().fillna(0)
        enhanced_df[f'pressure_std_{window}d'] = enhanced_df['pressure'].rolling(window=window, min_periods=4).std().fillna(0)
        enhanced_df[f'humidity_std_{window}d'] = enhanced_df['humidity'].rolling(window=window, min_periods=4).std().fillna(0)
    
    return enhanced_df

In [5]:
train_data = pd.read_csv(train_data_rt)
test_data = pd.read_csv(test_data_rt)
indices = test_data['id']
test_data = test_data.drop(columns = ["id"])

train_data_fe = feature_engineering(train_data)
test_data_fe = feature_engineering(test_data).to_numpy()

In [6]:
# Creating the custom dataloader for loading the data

from torch.utils.data import Dataset, DataLoader

class CSVDataset(Dataset):
    def __init__(self, df):
        self.data = df.drop(columns = ["id", "rainfall"]).values # In order to load them as a numpy array
        self.target = df["rainfall"].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx, :], dtype = torch.float32)
        labels = torch.tensor(self.target[idx], dtype = torch.float32)

        return features, labels

In [7]:
# Initializing the model and other functionalities
import torch.optim as optim

model = RainfallClassifierModel()
optimizer = optim.SGD(model.parameters(), lr = 0.0001)
criterion = nn.BCEWithLogitsLoss()

dataset = CSVDataset(train_data_fe)
dataloader = DataLoader(dataset, batch_size = 10, shuffle = True)

In [8]:
# Function to initialize weights

import torch.nn.init as init

def initialize_weights(model, init_type="xavier"):
    for m in model.modules():
        if isinstance(m, nn.Linear):  # Apply to linear layers
            if init_type == "xavier":
                init.xavier_uniform_(m.weight)  # Xavier initialization
            elif init_type == "he":
                init.kaiming_uniform_(m.weight, nonlinearity='relu')  # He initialization
            elif init_type == "orthogonal":
                init.orthogonal_(m.weight)  # Orthogonal initialization
            else:
                raise ValueError("Unknown initialization type")
            if m.bias is not None:
                init.zeros_(m.bias)  # Initialize bias to zero

initialize_weights(model, init_type = "he")

In [9]:
# Training Code

num_of_epochs = 50

for i in range(num_of_epochs):
    model.train()
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        y_pred = model(batch_x).squeeze(1)
        
        loss = criterion(y_pred, batch_y.float())  # BCE loss
        
        loss.backward()
        optimizer.step()
    print(f"Epoch : {i} --------- Loss : {loss}")

Epoch : 0 --------- Loss : 0.5543626546859741
Epoch : 1 --------- Loss : 0.9438101649284363
Epoch : 2 --------- Loss : 0.5250009298324585
Epoch : 3 --------- Loss : 0.6572718024253845
Epoch : 4 --------- Loss : 0.5554602742195129
Epoch : 5 --------- Loss : 0.8258687257766724
Epoch : 6 --------- Loss : 0.6683531999588013
Epoch : 7 --------- Loss : 0.5981622338294983
Epoch : 8 --------- Loss : 0.7549988031387329
Epoch : 9 --------- Loss : 0.58006352186203
Epoch : 10 --------- Loss : 0.6024667620658875
Epoch : 11 --------- Loss : 0.6316583156585693
Epoch : 12 --------- Loss : 0.7649912238121033
Epoch : 13 --------- Loss : 0.6327900886535645
Epoch : 14 --------- Loss : 0.6328426599502563
Epoch : 15 --------- Loss : 0.7069187760353088
Epoch : 16 --------- Loss : 0.5855089426040649
Epoch : 17 --------- Loss : 0.5871124863624573
Epoch : 18 --------- Loss : 0.5200592279434204
Epoch : 19 --------- Loss : 0.628982424736023
Epoch : 20 --------- Loss : 0.7118790149688721
Epoch : 21 --------- Loss 

In [10]:
# Prediction code

X_test_tensor = torch.tensor(test_data_fe, dtype=torch.float32)

model.eval()

with torch.no_grad():  # No gradients needed for inference
    logits = model(X_test_tensor)  # Forward pass
    probabilities = torch.sigmoid(logits)  # Apply Sigmoid (for binary classification)

# Convert to binary labels (0 or 1) based on threshold 0.5
predictions = (probabilities > 0.5).int()

probabilities = torch.nan_to_num(probabilities, nan=0.5)

print(torch.isnan(probabilities).sum())  # Count NaN values
print(torch.isinf(probabilities).sum())

tensor(0)
tensor(0)


In [11]:
# Concatenation

final_dataframe = np.concatenate((np.array(indices).reshape(-1, 1), probabilities.numpy()), axis = 1)
submission = pd.DataFrame(final_dataframe, columns = ["id", "rainfall"])
submission["id"] = submission["id"].astype(int)
submission.to_csv("submission.csv", index = False)

submission.isnull().sum()

id          0
rainfall    0
dtype: int64