## Rainfall prediction

### Imports

In [1]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split

# Ignore all warnings
import warnings
warnings.simplefilter("ignore")


### Load Data

In [2]:
# Directory holding the competition CSVs. Set the RAINFALL_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local path, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get(
    'RAINFALL_DATA_DIR',
    r'C:\Users\joshw\Repos\Kaggle\playground-series-s5e3\input',
))

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')


In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


### Preprocess

In [4]:
# Per-column count of missing values in the test frame.
test_df.isnull().sum(axis=0)

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [5]:
# Fill the missing winddirection value in the test set.
# A KNNImputer fitted on the winddirection column alone has no other feature to
# measure neighbour distance with, so it silently falls back to the column mean.
# Fit it on the training feature columns instead: the 5 nearest training rows
# (n_neighbors=5 is also the KNNImputer default) then supply the value, and no
# statistics are learned from the test set.
# NOTE(review): distances are computed on the raw, unscaled columns - confirm
# that is acceptable or scale before imputing.
impute_cols = [col for col in test_df.columns if col != 'id']
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train_df[impute_cols])
test_imputed = imputer.transform(test_df[impute_cols])
test_df['winddirection'] = test_imputed[:, impute_cols.index('winddirection')]

### Feature Engineering

In [6]:
#Start by doing some basic/obvious ones then can expand
# Also some reasonable interactions like humidity/sunshine
def generate_features(df):
    """Return a new DataFrame with the engineered weather features appended.

    The input frame is left untouched, so the cell that calls this function can
    be re-run safely and earlier displays of ``df`` stay accurate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``windspeed``, ``winddirection`` (degrees),
        ``humidity`` and ``sunshine``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with three extra columns, in this order:
        ``wind_x``, ``wind_y`` (wind speed projected onto the cosine / sine of
        the direction) and ``humid_sun_inter`` (humidity * sunshine).
    """
    #Wind direction currently means nothing in degrees, convert to cos and sin
    direction_rad = np.radians(df['winddirection'])

    #Possible additions:
    #   wet-bulb temp
    #   saturated vapour
    #   proportions for cloud/sun, percentages
    #   bucket ranges for temperature

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        wind_x=df['windspeed'] * np.cos(direction_rad),
        wind_y=df['windspeed'] * np.sin(direction_rad),
        #Interactions
        humid_sun_inter=df['humidity'] * df['sunshine'],
    )

In [7]:
# Add the engineered columns to the training frame, then to the test frame.
train_df, test_df = (generate_features(frame) for frame in (train_df, test_df))

### Model prep

In [8]:
# One shared seed for every random number generator the notebook relies on.
SEED = 42

# Seed PyTorch (including all CUDA devices), NumPy's legacy global RNG and
# Python's built-in RNG, in that order.
for seed_rng in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
    seed_rng(SEED)

In [9]:
# Target column, then the feature matrices: the row id is dropped from both
# frames and the target is dropped from the training features.
Y = train_df['rainfall']
X = train_df.drop(['id', 'rainfall'], axis=1)
test = test_df.drop('id', axis=1)

In [10]:
# Min-max scale the features: the scaler learns each column's range from the
# training features only and the same transform is then applied to the test set.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split below, so validation rows influence the learned ranges - confirm this
# is acceptable.
scaler = MinMaxScaler().fit(X)
X, test = scaler.transform(X), scaler.transform(test)

### Modelling

In [11]:
#Prepare MLP inputs as float32 tensors
X = torch.tensor(X, dtype=torch.float32)
test = torch.tensor(test, dtype=torch.float32)

# Convert the target Series to a NumPy array before handing it to torch:
# passing the Series directly makes the conversion depend on its index labels,
# which breaks once the index is no longer the default 0..n-1 range (e.g. after
# rows have been filtered). The target is reshaped to a single column so it
# matches the (N, 1) output of the network's final Linear(5, 1) layer.
y = torch.tensor(Y.to_numpy(), dtype=torch.float32).reshape(-1, 1)

In [12]:
# Number of input features (columns of the 2-D feature tensor); used as the
# width of the network's first layer.
X_width = X.shape[-1]

In [17]:
# Run on the GPU when one is available. The model is moved to `device`, so every
# batch must be moved there too before the forward pass; otherwise PyTorch
# raises a device-mismatch error on CUDA machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

k_folds = 4
kf = KFold(n_splits=k_folds, shuffle=True, random_state=SEED)

cv_scores = []  # validation ROC AUC of each fold
models = []     # trained model of each fold, kept for test-time averaging

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold + 1}/{k_folds}')

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Despite their names these are DataLoaders (mini-batch iterators); only the
    # training loader reshuffles each epoch.
    train_dataset = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
    val_dataset = DataLoader(TensorDataset(X_val, y_val), batch_size=32, shuffle=False)

    # Fresh MLP per fold. The final Sigmoid makes the output a probability,
    # which is what BCELoss expects.
    model = nn.Sequential(
        nn.Linear(X_width, X_width),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(X_width, 20),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(20, 10),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(10, 5),
        nn.ReLU(),
        nn.Linear(5, 1),
        nn.Sigmoid()
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.BCELoss().to(device)

    # Training
    num_epochs = 100
    for epoch in range(num_epochs):
        model.train()  # enable dropout
        for x_batch, y_batch in train_dataset:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            pred = model(x_batch)
            loss = loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()  # disable dropout
    val_losses = []
    y_true, y_pred = [], []
    with torch.no_grad():
        for x_batch, y_batch in val_dataset:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            pred = model(x_batch)
            loss = loss_fn(pred, y_batch)
            val_losses.append(loss.item())
            # Flatten the (batch, 1) tensors so the scorer receives plain 1-D
            # sequences of labels and probabilities.
            y_true.extend(y_batch.cpu().numpy().ravel())
            y_pred.extend(pred.cpu().numpy().ravel())

    # scoring
    auc_score = roc_auc_score(y_true, y_pred)
    cv_scores.append(auc_score)
    print(f'Fold {fold + 1} AUC: {auc_score:.4f}')

    models.append(model)

print(f'Cross-validated ROC AUC score: {np.mean(cv_scores):.5f} +/- {np.std(cv_scores):.5f}')


Fold 1/4
Fold 1 AUC: 0.8734
Fold 2/4
Fold 2 AUC: 0.8940
Fold 3/4
Fold 3 AUC: 0.8944
Fold 4/4
Fold 4 AUC: 0.8846
Cross-validated ROC AUC score: 0.88658 +/- 0.00858


In [19]:
# Keep the test row ids for the submission file.
test_id = test_df.loc[:, 'id']

In [None]:
# Predict the test set with every fold's model and average the probabilities.
submit_score = []

# Inference only: no_grad() avoids building autograd graphs for the forward pass.
with torch.no_grad():
    for fold_, model in enumerate(models):
        model.eval()  # make sure dropout is disabled, even if this cell is re-run
        # Feed the test tensor on whichever device this model's weights live on.
        model_device = next(model.parameters()).device
        pred_ = model(test.to(model_device))
        submit_score.append(pred_)

#predict test data
pred = np.mean([score.detach().cpu().numpy() for score in submit_score], axis=0)

array([[0.980801  ],
       [0.98613536],
       [0.9486872 ],
       [0.1461074 ],
       [0.1315348 ],
       [0.8896691 ],
       [0.94101423],
       [0.9845809 ],
       [0.9600851 ],
       [0.8298796 ],
       [0.98381364],
       [0.13544157],
       [0.9812044 ],
       [0.9793018 ],
       [0.26948166],
       [0.12527554],
       [0.8738164 ],
       [0.8120552 ],
       [0.13142242],
       [0.12478831],
       [0.13740884],
       [0.22879389],
       [0.8003457 ],
       [0.98342645],
       [0.86445147],
       [0.41199937],
       [0.12614337],
       [0.9855894 ],
       [0.9309299 ],
       [0.3197224 ],
       [0.9451911 ],
       [0.96260506],
       [0.8120272 ],
       [0.96013176],
       [0.84524703],
       [0.95255065],
       [0.23818135],
       [0.9332581 ],
       [0.81787044],
       [0.9268284 ],
       [0.85710716],
       [0.8765327 ],
       [0.13493769],
       [0.93504703],
       [0.93258524],
       [0.21669297],
       [0.24420589],
       [0.971

In [22]:
# Submission frame: one averaged rain probability per test id.
sub = pd.DataFrame({
    'id': test_id,
    'rainfall': pred.flatten()
})

# to_csv() does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('submissions', exist_ok=True)
sub.to_csv('submissions/sub_1.csv', index=False)