In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder


In [18]:

# Load Dataset
df = pd.read_csv("weatherAUS.csv") 
df.head()


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [20]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

 ### Step-by-Step Missing Value Handling (with Explanation)
**Assumptions:**
Numerical features: median imputation

Categorical features: "Unknown" label

Columns with excessive missingness (Evaporation, Sunshine, Cloud9am, Cloud3pm) can be dropped or handled based on EDA and modeling preference

In [21]:
df.isnull().sum().sort_values(ascending=False)

Sunshine         69835
Evaporation      62790
Cloud3pm         59358
Cloud9am         55888
Pressure9am      15065
Pressure3pm      15028
WindDir9am       10566
WindGustDir      10326
WindGustSpeed    10263
Humidity3pm       4507
WindDir3pm        4228
Temp3pm           3609
RainTomorrow      3267
Rainfall          3261
RainToday         3261
WindSpeed3pm      3062
Humidity9am       2654
WindSpeed9am      1767
Temp9am           1767
MinTemp           1485
MaxTemp           1261
Date                 0
Location             0
dtype: int64

In [22]:
# Drop high-missing-value columns (optional based on project goal)
high_missing_cols = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df.drop(columns=high_missing_cols, inplace=True)


In [23]:
# Identify column types
cat_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Exclude target from preprocessing
target_col = 'RainTomorrow'
cat_cols.remove(target_col)


In [25]:
# Impute missing values

# 1. Categorical - fill with 'Unknown'
df[cat_cols] = df[cat_cols].fillna('Unknown')

# 2. Numerical - fill with median
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())


In [26]:
# Check missing values again
df.isnull().sum().sort_values(ascending=False)


RainTomorrow     3267
Location            0
Date                0
MinTemp             0
MaxTemp             0
WindGustDir         0
Rainfall            0
WindDir9am          0
WindDir3pm          0
WindSpeed9am        0
WindGustSpeed       0
WindSpeed3pm        0
Humidity9am         0
Pressure9am         0
Humidity3pm         0
Pressure3pm         0
Temp9am             0
Temp3pm             0
RainToday           0
dtype: int64

In [27]:
# Cell 6: Drop rows with target label missing
df.dropna(subset=['RainTomorrow'], inplace=True)

# Confirm clean dataset
print("✅ Final shape:", df.shape)
print("✅ Missing values:\n", df.isnull().sum().sum())


✅ Final shape: (142193, 19)
✅ Missing values:
 0


In [35]:
# Cell 2: Load dataset
data = pd.read_csv("weatherAUS.csv")

In [36]:
# Cell 3: Handle missing values
# Drop columns with more than 50% missing values
threshold = 0.5
missing_ratio = data.isnull().mean()
data = data.drop(columns=missing_ratio[missing_ratio > threshold].index)

# Fill numerical columns with median and categorical with mode
for col in data.columns:
    if data[col].dtype in ["float64", "int64"]:
        data[col] = data[col].fillna(data[col].median())
    elif data[col].dtype == "object":
        data[col] = data[col].fillna(data[col].mode()[0])

In [37]:
# Cell 4: Encode categorical features
label_encoders = {}
for col in data.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [38]:
# Cell 5: Define features and target
X = data.drop(columns=["RainTomorrow", "Date"])
y = data["RainTomorrow"]

In [39]:
# Cell 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# # Cell 7: Define Optuna objective function
# def objective(trial):
#     params = {
#         "objective": "binary",
#         "metric": "auc",
#         "boosting_type": "gbdt",
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 150),
#         "max_depth": trial.suggest_int("max_depth", 3, 15),
#         "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
#         "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 10.0)
#     }

#     train_set = lgb.Dataset(X_train, label=y_train)
#     valid_set = lgb.Dataset(X_test, label=y_test, reference=train_set)
# # 
#     model = lgb.train(
#         params,
#         train_set,
#         valid_sets=[valid_set],
#         num_boost_round=200,
#         early_stopping_rounds=20,
#         verbose_eval=False
#     )

#     preds = model.predict(X_test)
#     return roc_auc_score(y_test, preds)


In [55]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1
    }

    model = lgb.LGBMClassifier(**params)

    from lightgbm import early_stopping

    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=20)],
    )

    preds = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, preds)


In [56]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)


[I 2025-06-28 14:32:59,871] A new study created in memory with name: no-name-e2e91e7b-3520-464e-ba81-fd480fffa733


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:02,997] Trial 0 finished with value: 0.863422556429823 and parameters: {'learning_rate': 0.10707190082652261, 'num_leaves': 146, 'max_depth': 8, 'min_child_samples': 10, 'feature_fraction': 0.8169442622301717, 'bagging_fraction': 0.6381726025586281, 'scale_pos_weight': 7.361953218356355}. Best is trial 0 with value: 0.863422556429823.


Early stopping, best iteration is:
[5]	valid_0's auc: 0.863423	valid_0's binary_logloss: 0.472047
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:06,289] Trial 1 finished with value: 0.8699151841952526 and parameters: {'learning_rate': 0.1085019167073011, 'num_leaves': 121, 'max_depth': 9, 'min_child_samples': 87, 'feature_fraction': 0.9652996540121415, 'bagging_fraction': 0.6337189414916309, 'scale_pos_weight': 3.9969872877992585}. Best is trial 1 with value: 0.8699151841952526.


Early stopping, best iteration is:
[10]	valid_0's auc: 0.869915	valid_0's binary_logloss: 0.428435
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:07,622] Trial 2 finished with value: 0.8493328638176394 and parameters: {'learning_rate': 0.13239150973334574, 'num_leaves': 99, 'max_depth': 5, 'min_child_samples': 25, 'feature_fraction': 0.9137653274948963, 'bagging_fraction': 0.7251959523428637, 'scale_pos_weight': 7.428462754006126}. Best is trial 1 with value: 0.8699151841952526.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.849333	valid_0's binary_logloss: 0.482764
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:13,615] Trial 3 finished with value: 0.8891973397683228 and parameters: {'learning_rate': 0.1611844934007947, 'num_leaves': 48, 'max_depth': 14, 'min_child_samples': 94, 'feature_fraction': 0.7040368964233419, 'bagging_fraction': 0.6787182358178484, 'scale_pos_weight': 3.1571473279178637}. Best is trial 3 with value: 0.8891973397683228.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.889197	valid_0's binary_logloss: 0.390513
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:14,989] Trial 4 finished with value: 0.8405073565703068 and parameters: {'learning_rate': 0.1264695192520485, 'num_leaves': 97, 'max_depth': 5, 'min_child_samples': 36, 'feature_fraction': 0.5270664851938651, 'bagging_fraction': 0.569712161718775, 'scale_pos_weight': 5.786278933855647}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.840507	valid_0's binary_logloss: 0.487169
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:15,798] Trial 5 finished with value: 0.8213062463862268 and parameters: {'learning_rate': 0.14511470816882216, 'num_leaves': 114, 'max_depth': 3, 'min_child_samples': 32, 'feature_fraction': 0.697363520903822, 'bagging_fraction': 0.8143440434965179, 'scale_pos_weight': 6.675305429681064}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[2]	valid_0's auc: 0.821306	valid_0's binary_logloss: 0.492836
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:17,968] Trial 6 finished with value: 0.8550856161936607 and parameters: {'learning_rate': 0.12473721014937804, 'num_leaves': 62, 'max_depth': 12, 'min_child_samples': 73, 'feature_fraction': 0.8188301196419014, 'bagging_fraction': 0.5327815848665409, 'scale_pos_weight': 8.437164450160035}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.855086	valid_0's binary_logloss: 0.480281
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:18,969] Trial 7 finished with value: 0.8396330708057697 and parameters: {'learning_rate': 0.0685426509750403, 'num_leaves': 101, 'max_depth': 3, 'min_child_samples': 97, 'feature_fraction': 0.8293974455124926, 'bagging_fraction': 0.6378751922510266, 'scale_pos_weight': 7.154614870762839}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[5]	valid_0's auc: 0.839633	valid_0's binary_logloss: 0.495129
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:21,000] Trial 8 finished with value: 0.8544015275680049 and parameters: {'learning_rate': 0.08266965603421561, 'num_leaves': 54, 'max_depth': 14, 'min_child_samples': 76, 'feature_fraction': 0.7765152460833968, 'bagging_fraction': 0.992321209662032, 'scale_pos_weight': 9.391314552201152}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.854402	valid_0's binary_logloss: 0.489186
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:23,327] Trial 9 finished with value: 0.8553723821442782 and parameters: {'learning_rate': 0.09111128527402557, 'num_leaves': 71, 'max_depth': 9, 'min_child_samples': 38, 'feature_fraction': 0.7733120174430362, 'bagging_fraction': 0.9585760820513769, 'scale_pos_weight': 9.41837606099125}. Best is trial 3 with value: 0.8891973397683228.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.855372	valid_0's binary_logloss: 0.4873
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:27,825] Trial 10 finished with value: 0.8861823468694556 and parameters: {'learning_rate': 0.19892006987362834, 'num_leaves': 28, 'max_depth': 14, 'min_child_samples': 60, 'feature_fraction': 0.6352226849513223, 'bagging_fraction': 0.8220071580770894, 'scale_pos_weight': 1.38202742108895}. Best is trial 3 with value: 0.8891973397683228.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.886182	valid_0's binary_logloss: 0.335936
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:31,557] Trial 11 finished with value: 0.885700279153668 and parameters: {'learning_rate': 0.19727938960541352, 'num_leaves': 20, 'max_depth': 15, 'min_child_samples': 57, 'feature_fraction': 0.622119861938744, 'bagging_fraction': 0.8308886781362683, 'scale_pos_weight': 1.472303871520078}. Best is trial 3 with value: 0.8891973397683228.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.8857	valid_0's binary_logloss: 0.339452
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:35,949] Trial 12 finished with value: 0.8874797017249377 and parameters: {'learning_rate': 0.18982318347943797, 'num_leaves': 28, 'max_depth': 12, 'min_child_samples': 59, 'feature_fraction': 0.650433157152056, 'bagging_fraction': 0.7577714261312842, 'scale_pos_weight': 1.156517958511488}. Best is trial 3 with value: 0.8891973397683228.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.88748	valid_0's binary_logloss: 0.331105
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.866484	valid_0's binary_logloss: 0.423204


[I 2025-06-28 14:33:42,055] Trial 13 finished with value: 0.8664843497516802 and parameters: {'learning_rate': 0.010736213127944608, 'num_leaves': 42, 'max_depth': 12, 'min_child_samples': 96, 'feature_fraction': 0.6710496638252268, 'bagging_fraction': 0.7230583723817373, 'scale_pos_weight': 3.013485443349028}. Best is trial 3 with value: 0.8891973397683228.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:47,419] Trial 14 finished with value: 0.8886182085798394 and parameters: {'learning_rate': 0.16272997661695437, 'num_leaves': 42, 'max_depth': 12, 'min_child_samples': 70, 'feature_fraction': 0.5603076933351381, 'bagging_fraction': 0.7603371936368378, 'scale_pos_weight': 3.263589621072173}. Best is trial 3 with value: 0.8891973397683228.


Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.888618	valid_0's binary_logloss: 0.3967
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890013	valid_0's binary_logloss: 0.408209


[I 2025-06-28 14:33:55,435] Trial 15 finished with value: 0.8900129463765535 and parameters: {'learning_rate': 0.1635919068221266, 'num_leaves': 76, 'max_depth': 11, 'min_child_samples': 75, 'feature_fraction': 0.509490121303958, 'bagging_fraction': 0.9229223505765949, 'scale_pos_weight': 3.902439959494064}. Best is trial 15 with value: 0.8900129463765535.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:33:58,116] Trial 16 finished with value: 0.8613597652668861 and parameters: {'learning_rate': 0.1680564614924501, 'num_leaves': 78, 'max_depth': 10, 'min_child_samples': 82, 'feature_fraction': 0.5667577984272791, 'bagging_fraction': 0.9412561123341618, 'scale_pos_weight': 4.39584598008094}. Best is trial 15 with value: 0.8900129463765535.


Early stopping, best iteration is:
[6]	valid_0's auc: 0.86136	valid_0's binary_logloss: 0.458647
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:00,215] Trial 17 finished with value: 0.846504052372504 and parameters: {'learning_rate': 0.16482770041262224, 'num_leaves': 53, 'max_depth': 15, 'min_child_samples': 88, 'feature_fraction': 0.5164856111271185, 'bagging_fraction': 0.8882228280686002, 'scale_pos_weight': 5.0878428139170015}. Best is trial 15 with value: 0.8900129463765535.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.846504	valid_0's binary_logloss: 0.47352
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.878637	valid_0's binary_logloss: 0.383976


[I 2025-06-28 14:34:08,892] Trial 18 finished with value: 0.8786373279129484 and parameters: {'learning_rate': 0.02710736559338732, 'num_leaves': 84, 'max_depth': 11, 'min_child_samples': 69, 'feature_fraction': 0.7212618876990106, 'bagging_fraction': 0.8914816109330526, 'scale_pos_weight': 2.4306499087785807}. Best is trial 15 with value: 0.8900129463765535.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.888002	valid_0's binary_logloss: 0.363956


[I 2025-06-28 14:34:15,002] Trial 19 finished with value: 0.8880017201834862 and parameters: {'learning_rate': 0.1485426604988411, 'num_leaves': 68, 'max_depth': 7, 'min_child_samples': 96, 'feature_fraction': 0.5951433537830183, 'bagging_fraction': 0.6581741402687405, 'scale_pos_weight': 2.3846199169004962}. Best is trial 15 with value: 0.8900129463765535.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:17,596] Trial 20 finished with value: 0.8604868054685317 and parameters: {'learning_rate': 0.05972833239818373, 'num_leaves': 46, 'max_depth': 13, 'min_child_samples': 51, 'feature_fraction': 0.8997475024934161, 'bagging_fraction': 0.5026356992309531, 'scale_pos_weight': 5.367329637007633}. Best is trial 15 with value: 0.8900129463765535.


Early stopping, best iteration is:
[11]	valid_0's auc: 0.860487	valid_0's binary_logloss: 0.457905
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:22,635] Trial 21 finished with value: 0.8876324695178925 and parameters: {'learning_rate': 0.1738551041490598, 'num_leaves': 38, 'max_depth': 11, 'min_child_samples': 68, 'feature_fraction': 0.5704061394179, 'bagging_fraction': 0.7686423532397985, 'scale_pos_weight': 3.613499076956004}. Best is trial 15 with value: 0.8900129463765535.


Did not meet early stopping. Best iteration is:
[98]	valid_0's auc: 0.887632	valid_0's binary_logloss: 0.411765
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:29,692] Trial 22 finished with value: 0.8896671508847835 and parameters: {'learning_rate': 0.1524518982003396, 'num_leaves': 59, 'max_depth': 13, 'min_child_samples': 82, 'feature_fraction': 0.5298346738597237, 'bagging_fraction': 0.6957990387178639, 'scale_pos_weight': 3.14898011992209}. Best is trial 15 with value: 0.8900129463765535.


Did not meet early stopping. Best iteration is:
[98]	valid_0's auc: 0.889667	valid_0's binary_logloss: 0.3883
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:31,926] Trial 23 finished with value: 0.8488446712373339 and parameters: {'learning_rate': 0.1803877226639593, 'num_leaves': 61, 'max_depth': 14, 'min_child_samples': 83, 'feature_fraction': 0.5036036995954832, 'bagging_fraction': 0.6866554602636266, 'scale_pos_weight': 4.621503275851296}. Best is trial 15 with value: 0.8900129463765535.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.848845	valid_0's binary_logloss: 0.466848
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890453	valid_0's binary_logloss: 0.357685


[I 2025-06-28 14:34:39,769] Trial 24 finished with value: 0.8904525900447833 and parameters: {'learning_rate': 0.14276678080102123, 'num_leaves': 77, 'max_depth': 13, 'min_child_samples': 100, 'feature_fraction': 0.5965120919895215, 'bagging_fraction': 0.585180700372796, 'scale_pos_weight': 2.3657556868499405}. Best is trial 24 with value: 0.8904525900447833.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.890609	valid_0's binary_logloss: 0.353415


[I 2025-06-28 14:34:48,460] Trial 25 finished with value: 0.8906092052007555 and parameters: {'learning_rate': 0.14905335882882467, 'num_leaves': 87, 'max_depth': 10, 'min_child_samples': 78, 'feature_fraction': 0.5411754007117621, 'bagging_fraction': 0.5987392549208206, 'scale_pos_weight': 2.239021898500863}. Best is trial 25 with value: 0.8906092052007555.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:34:55,208] Trial 26 finished with value: 0.8892009501062973 and parameters: {'learning_rate': 0.13935973039884642, 'num_leaves': 87, 'max_depth': 7, 'min_child_samples': 49, 'feature_fraction': 0.6007973916250026, 'bagging_fraction': 0.5882187884166529, 'scale_pos_weight': 2.1838462070966562}. Best is trial 25 with value: 0.8906092052007555.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.889201	valid_0's binary_logloss: 0.354814
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.891242	valid_0's binary_logloss: 0.350483


[I 2025-06-28 14:35:05,802] Trial 27 finished with value: 0.8912415845804287 and parameters: {'learning_rate': 0.12135248320669911, 'num_leaves': 114, 'max_depth': 11, 'min_child_samples': 100, 'feature_fraction': 0.5512734781909172, 'bagging_fraction': 0.5929453470563474, 'scale_pos_weight': 2.19693154497164}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.89	valid_0's binary_logloss: 0.345905


[I 2025-06-28 14:35:16,470] Trial 28 finished with value: 0.8900000954970463 and parameters: {'learning_rate': 0.11519742515511108, 'num_leaves': 122, 'max_depth': 10, 'min_child_samples': 100, 'feature_fraction': 0.5814770200938921, 'bagging_fraction': 0.5883983892534091, 'scale_pos_weight': 1.9929557030222549}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.889637	valid_0's binary_logloss: 0.341867


[I 2025-06-28 14:35:25,810] Trial 29 finished with value: 0.8896368872524771 and parameters: {'learning_rate': 0.11646464259617707, 'num_leaves': 148, 'max_depth': 8, 'min_child_samples': 90, 'feature_fraction': 0.6653445326256823, 'bagging_fraction': 0.5600956015733292, 'scale_pos_weight': 1.8060127540400965}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890118	valid_0's binary_logloss: 0.366789


[I 2025-06-28 14:35:37,778] Trial 30 finished with value: 0.8901178797677072 and parameters: {'learning_rate': 0.09498314741486873, 'num_leaves': 136, 'max_depth': 10, 'min_child_samples': 91, 'feature_fraction': 0.5462934992063248, 'bagging_fraction': 0.6102898461893632, 'scale_pos_weight': 2.610225733449522}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890811	valid_0's binary_logloss: 0.363306


[I 2025-06-28 14:35:49,437] Trial 31 finished with value: 0.8908106936630632 and parameters: {'learning_rate': 0.09959038778684948, 'num_leaves': 136, 'max_depth': 10, 'min_child_samples': 100, 'feature_fraction': 0.5484532541027043, 'bagging_fraction': 0.6159721380749618, 'scale_pos_weight': 2.5716666025912143}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890375	valid_0's binary_logloss: 0.326277


[I 2025-06-28 14:36:01,286] Trial 32 finished with value: 0.8903747118599911 and parameters: {'learning_rate': 0.10316844717221674, 'num_leaves': 138, 'max_depth': 9, 'min_child_samples': 10, 'feature_fraction': 0.6127864696771745, 'bagging_fraction': 0.5392022018037033, 'scale_pos_weight': 1.0368716291376763}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.889686	valid_0's binary_logloss: 0.343552


[I 2025-06-28 14:36:10,179] Trial 33 finished with value: 0.8896862743400673 and parameters: {'learning_rate': 0.13432678378285928, 'num_leaves': 118, 'max_depth': 8, 'min_child_samples': 86, 'feature_fraction': 0.5494381487348168, 'bagging_fraction': 0.6298790262069425, 'scale_pos_weight': 1.8914930058627313}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.888876	valid_0's binary_logloss: 0.37694


[I 2025-06-28 14:36:20,323] Trial 34 finished with value: 0.8888758410610369 and parameters: {'learning_rate': 0.07512846779056004, 'num_leaves': 108, 'max_depth': 10, 'min_child_samples': 99, 'feature_fraction': 0.5860749354596582, 'bagging_fraction': 0.6093565876235179, 'scale_pos_weight': 2.7565785330609547}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:36:24,408] Trial 35 finished with value: 0.8719107220785873 and parameters: {'learning_rate': 0.10971618605608073, 'num_leaves': 129, 'max_depth': 13, 'min_child_samples': 91, 'feature_fraction': 0.9913634175954558, 'bagging_fraction': 0.5035298808859738, 'scale_pos_weight': 3.755078213276561}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[11]	valid_0's auc: 0.871911	valid_0's binary_logloss: 0.421352
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:36:27,237] Trial 36 finished with value: 0.8492951356140501 and parameters: {'learning_rate': 0.1251890136548014, 'num_leaves': 93, 'max_depth': 11, 'min_child_samples': 92, 'feature_fraction': 0.5471777233864723, 'bagging_fraction': 0.6660431927494637, 'scale_pos_weight': 6.12719467748197}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.849295	valid_0's binary_logloss: 0.478562
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.88261	valid_0's binary_logloss: 0.352259


[I 2025-06-28 14:36:35,177] Trial 37 finished with value: 0.8826103863412017 and parameters: {'learning_rate': 0.05688885177130647, 'num_leaves': 107, 'max_depth': 7, 'min_child_samples': 82, 'feature_fraction': 0.5002432941191552, 'bagging_fraction': 0.5513461036943905, 'scale_pos_weight': 1.7400063610724534}. Best is trial 27 with value: 0.8912415845804287.


Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:36:39,290] Trial 38 finished with value: 0.8692366673756806 and parameters: {'learning_rate': 0.10054550291337781, 'num_leaves': 141, 'max_depth': 9, 'min_child_samples': 77, 'feature_fraction': 0.6865921486164219, 'bagging_fraction': 0.5864288892274069, 'scale_pos_weight': 4.537454974504122}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[11]	valid_0's auc: 0.869237	valid_0's binary_logloss: 0.447448
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:46,725] Trial 39 finished with value: 0.8901238740966944 and parameters: {'learning_rate': 0.15262773804465224, 'num_leaves': 128, 'max_depth': 9, 'min_child_samples': 64, 'feature_fraction': 0.6343455591360934, 'bagging_fraction': 0.7163088640783435, 'scale_pos_weight': 2.8756811962141944}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.890124	valid_0's binary_logloss: 0.370461
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:51,663] Trial 40 finished with value: 0.89077524639612 and parameters: {'learning_rate': 0.13511915422483498, 'num_leaves': 93, 'max_depth': 12, 'min_child_samples': 99, 'feature_fraction': 0.7296523113514337, 'bagging_fraction': 0.6217975338827653, 'scale_pos_weight': 3.473810485843757}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890775	valid_0's binary_logloss: 0.392363
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:54,947] Trial 41 finished with value: 0.8909315386484105 and parameters: {'learning_rate': 0.13946261511005004, 'num_leaves': 91, 'max_depth': 12, 'min_child_samples': 100, 'feature_fraction': 0.9200275198274299, 'bagging_fraction': 0.6166202515657079, 'scale_pos_weight': 3.4392551832054856}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890932	valid_0's binary_logloss: 0.390752
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:56,061] Trial 42 finished with value: 0.8692641897618372 and parameters: {'learning_rate': 0.12063759838959558, 'num_leaves': 92, 'max_depth': 12, 'min_child_samples': 94, 'feature_fraction': 0.8797780714687511, 'bagging_fraction': 0.6239126163386507, 'scale_pos_weight': 4.217368506454063}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[9]	valid_0's auc: 0.869264	valid_0's binary_logloss: 0.437643
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:58,721] Trial 43 finished with value: 0.8899938023103965 and parameters: {'learning_rate': 0.13495051505452688, 'num_leaves': 104, 'max_depth': 11, 'min_child_samples': 100, 'feature_fraction': 0.8473400339984584, 'bagging_fraction': 0.6545008844513049, 'scale_pos_weight': 3.3783303312767}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.889994	valid_0's binary_logloss: 0.389486
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:51:59,905] Trial 44 finished with value: 0.8732290794139697 and parameters: {'learning_rate': 0.08824086515075646, 'num_leaves': 112, 'max_depth': 10, 'min_child_samples': 86, 'feature_fraction': 0.9537883660932208, 'bagging_fraction': 0.5332757902974881, 'scale_pos_weight': 3.442191790674372}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[17]	valid_0's auc: 0.873229	valid_0's binary_logloss: 0.416097
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:52:00,808] Trial 45 finished with value: 0.864059453025896 and parameters: {'learning_rate': 0.1295891508230476, 'num_leaves': 98, 'max_depth': 11, 'min_child_samples': 93, 'feature_fraction': 0.768718251186364, 'bagging_fraction': 0.6096233809335927, 'scale_pos_weight': 4.952733267771223}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[6]	valid_0's auc: 0.864059	valid_0's binary_logloss: 0.454308
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 14:52:03,522] Trial 46 finished with value: 0.8906340722194009 and parameters: {'learning_rate': 0.1091527825576512, 'num_leaves': 91, 'max_depth': 12, 'min_child_samples': 78, 'feature_fraction': 0.7265194198502863, 'bagging_fraction': 0.6458424453375823, 'scale_pos_weight': 1.4608762330267555}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.890634	valid_0's binary_logloss: 0.331649
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 17:44:49,847] Trial 47 finished with value: 0.8911866394273366 and parameters: {'learning_rate': 0.11106709150342306, 'num_leaves': 127, 'max_depth': 12, 'min_child_samples': 95, 'feature_fraction': 0.7405206326620355, 'bagging_fraction': 0.6418161555932613, 'scale_pos_weight': 1.506845539602725}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.891187	valid_0's binary_logloss: 0.331528
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 17:44:53,335] Trial 48 finished with value: 0.8591583694161021 and parameters: {'learning_rate': 0.09844572554674205, 'num_leaves': 128, 'max_depth': 13, 'min_child_samples': 96, 'feature_fraction': 0.7361668576145993, 'bagging_fraction': 0.5689001920103346, 'scale_pos_weight': 8.025514999001384}. Best is trial 27 with value: 0.8912415845804287.


Early stopping, best iteration is:
[4]	valid_0's auc: 0.859158	valid_0's binary_logloss: 0.478304
Training until validation scores don't improve for 20 rounds


[I 2025-06-28 17:44:54,943] Trial 49 finished with value: 0.8764788713815551 and parameters: {'learning_rate': 0.11416645365719874, 'num_leaves': 118, 'max_depth': 4, 'min_child_samples': 88, 'feature_fraction': 0.768529480890116, 'bagging_fraction': 0.7050181331859798, 'scale_pos_weight': 1.429272346451622}. Best is trial 27 with value: 0.8912415845804287.


Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.876479	valid_0's binary_logloss: 0.350377


In [59]:
# Assuming `study` is your completed Optuna study
best_trial = study.best_trial

print("Best Trial:")
print(f"  Value (AUC): {best_trial.value}")
print("  Best Hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Best Trial:
  Value (AUC): 0.8912415845804287
  Best Hyperparameters:
    learning_rate: 0.12135248320669911
    num_leaves: 114
    max_depth: 11
    min_child_samples: 100
    feature_fraction: 0.5512734781909172
    bagging_fraction: 0.5929453470563474
    scale_pos_weight: 2.19693154497164


In [60]:
# If you want to save these best parameters as a dictionary for later use:
best_params = best_trial.params
best_params['objective'] = 'binary'
best_params['boosting_type'] = 'gbdt'
best_params['verbosity'] = -1
best_params['n_jobs'] = -1


In [61]:
# You can then use best_params to retrain your model like this:
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train, y_train)


0,1,2
,boosting_type,'gbdt'
,num_leaves,114
,max_depth,11
,learning_rate,0.12135248320669911
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [62]:
# Predict probability of rain
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

# Convert probabilities to binary prediction (0 = No Rain, 1 = Rain)
y_pred = (y_pred_proba >= 0.5).astype(int)


In [63]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


AUC Score: 0.8912415845804287
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90     22672
           1       0.63      0.69      0.66      6420

    accuracy                           0.84     29092
   macro avg       0.77      0.79      0.78     29092
weighted avg       0.85      0.84      0.85     29092

Confusion Matrix:
 [[20066  2606]
 [ 1975  4445]]


### (Optional) Save the Model
If you want to use the model later:

```python
import joblib
joblib.dump(final_model, 'rain_prediction_model.pkl')
```


### (Optional) Predict on New Data
To predict rainfall for tomorrow’s conditions:

```python
# Example new sample (make sure it’s preprocessed the same way as your training set)
new_sample = pd.DataFrame([{
    'MinTemp': 10.2,
    'MaxTemp': 21.5,
    'Rainfall': 0.0,
    'Humidity9am': 80,
    'Humidity3pm': 65,
    # Add all other required features...
}])

# Apply same preprocessing as training data
# ...

# Predict rain
rain_prob = final_model.predict_proba(new_sample)[:, 1]
rain_prediction = (rain_prob >= 0.5).astype(int)

print("Will it rain tomorrow?", "Yes" if rain_prediction[0] == 1 else "No")

```

In [66]:
import joblib
joblib.dump(final_model, 'rain_prediction_model.pkl')

['rain_prediction_model.pkl']

In [67]:
print(X_train.columns.tolist())


['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
