# Binary Prediction with a Rainfall Dataset
By Josh Houlding

Competition page: [https://www.kaggle.com/competitions/playground-series-s5e3/data](https://www.kaggle.com/competitions/playground-series-s5e3/data) <br>
Extra training data: [https://www.kaggle.com/datasets/subho117/rainfall-prediction-using-machine-learning](https://www.kaggle.com/datasets/subho117/rainfall-prediction-using-machine-learning)

## Loading the data

In [1]:
import pandas as pd

# Read the three competition CSV files from the working directory
df_train, df_test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Preview each frame: its shape plus a seeded (repeatable) random sample
for heading, frame in (("TRAINING SET", df_train), ("TEST SET", df_test)):
    print(f"{heading} \n Dataset shape: {frame.shape} \n Dataset sample: \n {frame.sample(5, random_state=42)} \n")
print(f"Sample submission: \n {sample_submission.sample(3, random_state=42)}")

TRAINING SET 
 Dataset shape: (2190, 13) 
 Dataset sample: 
         id  day  pressure  maxtemp  temparature  mintemp  dewpoint  humidity  \
289    289  290    1014.9     26.3         23.3     20.1      17.4      75.0   
1692  1692  233    1009.5     31.0         29.8     28.2      24.5      78.0   
1590  1590  131    1012.2     31.9         29.9     28.3      24.9      75.0   
383    383   19    1034.6     11.2         10.4      7.0       3.4      77.0   
1818  1818  359    1017.5     17.5         16.4     15.8      15.9      85.0   

      cloud  sunshine  winddirection  windspeed  rainfall  
289    83.0       3.0           40.0       22.0         1  
1692   78.0       7.7          220.0       26.4         1  
1590   72.0       8.4          180.0        8.0         1  
383    95.0       0.0           40.0       16.0         1  
1818   91.0       1.5           50.0       19.0         1   

TEST SET 
 Dataset shape: (730, 12) 
 Dataset sample: 
        id  day  pressure  maxtemp  tempa

## Renaming column(s)

In [2]:
# Correct the misspelled "temparature" header in both frames
new_column_mapping = {"temparature": "temperature"}
df_train, df_test = [
    frame.rename(columns=new_column_mapping) for frame in (df_train, df_test)
]

## Handling duplicates, missing values and outliers

In [3]:
# Drop rows that are exact duplicates of an earlier row, in both frames.
# NOTE(review): every row carries an `id` column, so presumably no two rows
# compare equal and this is a no-op — confirm.
df_train, df_test = df_train.drop_duplicates(), df_test.drop_duplicates()

In [4]:
# Tally NaN entries column by column for each frame, then report them
train_missing_counts = df_train.isna().sum()
test_missing_counts = df_test.isna().sum()

print(f"Missing value counts in training set:\n {train_missing_counts} \n")
print(f"Missing value counts in test set:\n {test_missing_counts}")

Missing value counts in training set:
 id               0
day              0
pressure         0
maxtemp          0
temperature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64 

Missing value counts in test set:
 id               0
day              0
pressure         0
maxtemp          0
temperature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64


In [5]:
# Fill the missing test winddirection value with the TRAINING-set median.
# Imputation statistics are taken from the training data so that both sets are
# preprocessed against the same reference value instead of one derived from
# the test set itself.
winddirection_median = df_train["winddirection"].median()
df_test["winddirection"] = df_test["winddirection"].fillna(winddirection_median)

In [6]:
# Count the test rows that still contain at least one missing value.
# any(axis=1) flags a row once no matter how many of its cells are NaN, so the
# number matches the "Rows with missing values" label; the earlier
# isna().sum().sum() counted individual missing cells instead.
rows_with_missing = df_test.isna().any(axis=1).sum()
print(f"Rows with missing values in test set after imputation: {rows_with_missing}")

Rows with missing values in test set after imputation: 0


In [7]:
# NOTE: this cell is disabled. Everything between the triple quotes is a plain
# string literal, so the IQR-based outlier detector is never defined; the
# trailing `;` only stops the notebook from echoing the string as cell output.
# NOTE(review): if this is re-enabled, the `dict: ...` line under `# Returns:`
# has no leading `#` and must be commented out for the function to parse.
"""
import numpy as np

def detect_outliers_iqr_all_cols(df, threshold=1.5):
    
    # Detects outliers in all numerical columns of a DataFrame using the IQR method.

    #Args:
    #    df (pd.DataFrame): The DataFrame to check.
    #    threshold (float): The IQR multiplier (e.g., 1.5, 3).

    # Returns:
        dict: A dictionary where keys are column names and values are lists of outlier indices.
    
    outlier_indices = {}
    for col in df.select_dtypes(include=np.number).columns: # Only check numerical columns
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index.tolist()
        if outliers:
            outlier_indices[col] = outliers
    return outlier_indices
    """;

In [8]:
# NOTE: this cell is disabled. The outlier report below sits inside a string
# literal and never runs; the trailing `;` suppresses the echoed string.
# It calls detect_outliers_iqr_all_cols, which in this notebook likewise exists
# only as text inside a string, so both cells would have to be restored together.
"""
# Find outliers in both datasets
train_outliers = detect_outliers_iqr_all_cols(df_train)
test_outliers = detect_outliers_iqr_all_cols(df_test)

def show_outliers(outlier_df):
    if outlier_df:
        for col, indices in outlier_df.items():
            print(f"Outliers in column '{col}': {indices}")
    else:
        print("No outliers found in any numerical columns.")
    
show_outliers(train_outliers)
""";

## Checking data types

In [9]:
# Print the dtype / non-null summary of each frame, separated by a blank line
for position, (set_name, frame) in enumerate((("training", df_train), ("test", df_test))):
    if position > 0:
        print()
    print(f"Info for {set_name} set:")
    frame.info()

Info for training set:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temperature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.6 KB

Info for test set:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 12 columns):
 #   Column         Non-Null Cou

All data types are appropriate, and no conversions are necessary.

## Scaling features

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

def check_feature_distributions(df):
    """
    Checks and visualizes the distributions of numerical features in a DataFrame.

    One figure is drawn per numerical column: a 30-bin histogram on the left
    and a text panel with mean, median, standard deviation, skew and kurtosis
    on the right. Missing values are dropped per column before plotting, so a
    NaN neither breaks the histogram nor turns the scipy skew/kurtosis into NaN
    (the pandas mean/median/std already skip NaN by default).

    Args:
        df (pd.DataFrame): The DataFrame to check.

    Returns:
        None. Each figure is displayed with plt.show() and then closed.
    """
    numerical_cols = df.select_dtypes(include=np.number).columns

    for col in numerical_cols:
        # Work on the non-missing values only so every statistic below is
        # computed over the same set of observations.
        values = df[col].dropna()

        # Explicit figure/axes handles instead of the implicit pyplot state
        fig, (ax_hist, ax_stats) = plt.subplots(1, 2, figsize=(10, 5))

        # Histogram
        ax_hist.hist(values, bins=30)
        ax_hist.set_title(f"Histogram of {col}")
        ax_hist.set_xlabel(col)
        ax_hist.set_ylabel("Count")

        # Descriptive Statistics
        text_str = (
            f"Mean: {values.mean():.2f}\n"
            f"Median: {values.median():.2f}\n"
            f"Std: {values.std():.2f}\n"
            f"Skew: {stats.skew(values):.2f}\n"
            f"Kurtosis: {stats.kurtosis(values):.2f}"
        )
        ax_stats.text(0.1, 0.5, text_str, fontsize=12)
        ax_stats.axis("off")  # Turn off axes for text display

        fig.tight_layout()
        plt.show()
        # Release the figure so a frame with many columns does not keep
        # every figure alive in memory.
        plt.close(fig)
        
# Draw the histogram/statistics figure for each numerical column of the training set
check_feature_distributions(df_train)

NameError: name 'np' is not defined

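Most columns follow a skewed distribution, so standardization (mean/standard deviation) and min-max normalization, both of which are sensitive to extreme values, are sub-optimal choices. Thus, we will use the `RobustScaler` class from `sklearn.preprocessing`, which centers each feature on its median and scales it by its interquartile range.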

In [None]:
# Show columns for ease of feature selection: head(0) selects zero rows,
# so the cell output is just the column header of the training frame.
df_train.head(0)

In [None]:
from sklearn.preprocessing import RobustScaler

# Function to apply a RobustScaler to features
def apply_robust_scaler(df, features_to_scale, scaler=None):
    """
    Return a copy of `df` with the given columns scaled by a RobustScaler.

    Args:
        df (pd.DataFrame): Frame holding the columns to scale; it is not modified.
        features_to_scale (list of str): Names of the columns to scale.
        scaler (RobustScaler, optional): An already-fitted scaler. When given,
            it is only applied (transform), so several frames can share the
            statistics learned from one of them. When omitted, a new scaler is
            fitted on `df` itself, which is the original behavior.

    Returns:
        pd.DataFrame: Copy of `df` whose `features_to_scale` columns are scaled.
    """
    df_scaled = df.copy()
    # Use the parameter, not the notebook-level `features` list, so the
    # function scales exactly the columns the caller asked for.
    if scaler is None:
        df_scaled[features_to_scale] = RobustScaler().fit_transform(df[features_to_scale])
    else:
        df_scaled[features_to_scale] = scaler.transform(df[features_to_scale])
    return df_scaled

# Define features to scale in both datasets
features = ["pressure", "maxtemp", "temperature", "mintemp", "dewpoint", "humidity", "cloud", "sunshine", "winddirection",
           "windspeed"]

# Fit the scaler on the training set only, before df_train is overwritten.
# Reusing it for the test set applies the same transformation to both sets;
# fitting a second scaler on the test set would scale the two differently.
feature_scaler = RobustScaler().fit(df_train[features])

# Scale features
df_train = apply_robust_scaler(df_train, features, scaler=feature_scaler)
df_test = apply_robust_scaler(df_test, features, scaler=feature_scaler)

# Show sample of both sets to check results
print(f"Processed training set: \n {df_train.sample(5, random_state=42)} \n")
print(f"Processed test set: \n {df_test.sample(5, random_state=42)} \n")

## Modeling