## **Data Pre-Processing**

Import packages and dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

df = pd.read_csv("weatherAUS.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

### **Data Cleaning**

Steps:
1. Handling Missing Values and Multicollinearity Features
2. Handling Duplicates
3. Handling Outliers

#### **Handling Missing Values and Multicollinearity Features**

Based on eda notebooks, some features need to be dropped due to missing values and multicollinearity. So, let's drop the mentioned features.

In [3]:
df = df.drop(columns=["Date", "MinTemp", "MaxTemp", "Sunshine", 
                      "Evaporation", "Cloud9am", "Cloud3pm", "Pressure9am", 
                      "Pressure3pm", "Temp9am", "Temp3pm"])

In [4]:
direction_to_angle = {
    'N': 0, 'NNE': 22.5, 'NE': 45, 'ENE': 67.5,
    'E': 90, 'ESE': 112.5, 'SE': 135, 'SSE': 157.5,
    'S': 180, 'SSW': 202.5, 'SW': 225, 'WSW': 247.5,
    'W': 270, 'WNW': 292.5, 'NW': 315, 'NNW': 337.5
}

df["WindGustAngle"] = df["WindGustDir"].map(direction_to_angle)
df["WindAngle9am"] = df["WindDir9am"].map(direction_to_angle)
df["WindAngle3pm"] = df["WindDir3pm"].map(direction_to_angle)
df = df.drop(columns=["WindGustDir", "WindDir9am", "WindDir3pm"])

In [5]:
numerical_features = df.select_dtypes(include=np.number).columns.to_list()
categorical_features = df.select_dtypes(include="O").columns.to_list()

df[numerical_features] = df[numerical_features].fillna(df[numerical_features].mean())
df = df.dropna(how="any")
print(df.isnull().sum())

Location         0
Rainfall         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
RainToday        0
RainTomorrow     0
WindGustAngle    0
WindAngle9am     0
WindAngle3pm     0
dtype: int64


In [6]:
df.shape

(140787, 12)

#### **Handling Duplicated Data**

In [7]:
df.duplicated().sum()

np.int64(302)

In [8]:
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

#### **Handling Outliers**

Based on eda notebooks, outlier exist on some features. So, let's detect with IQR method then handle it.

In [9]:
lower = df[numerical_features].quantile(0.25)
upper = df[numerical_features].quantile(0.75)
IQR = upper - lower
outlier = (df[numerical_features] < (lower - 1.5 * IQR)) | (df[numerical_features] > (upper + 1.5 * IQR))
print(outlier.sum())

Rainfall         25227
WindGustSpeed     5314
WindSpeed9am      1715
WindSpeed3pm      2420
Humidity9am       1414
Humidity3pm          0
WindGustAngle        0
WindAngle9am         0
WindAngle3pm         0
dtype: int64


In [10]:
df_cleaned = df[numerical_features].clip(lower=df[numerical_features].quantile(0.01), upper=df[numerical_features].quantile(0.99), axis=1)
df_cleaned

Unnamed: 0,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,WindGustAngle,WindAngle9am,WindAngle3pm
0,0.6,44.0,20.0,24.0,71.0,22.0,270.0,270.0,292.5
1,0.0,44.0,4.0,22.0,44.0,25.0,292.5,337.5,247.5
2,0.0,46.0,19.0,26.0,38.0,30.0,247.5,270.0,247.5
3,0.0,24.0,11.0,9.0,45.0,16.0,45.0,135.0,90.0
4,1.0,41.0,7.0,20.0,82.0,33.0,270.0,67.5,315.0
...,...,...,...,...,...,...,...,...,...
145454,0.0,31.0,15.0,13.0,59.0,27.0,90.0,112.5,90.0
145455,0.0,31.0,13.0,11.0,51.0,24.0,90.0,135.0,67.5
145456,0.0,22.0,13.0,9.0,56.0,21.0,337.5,135.0,0.0
145457,0.0,37.0,9.0,9.0,53.0,24.0,0.0,135.0,292.5


### Data Transformation

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

X = df.drop(columns=["RainToday", "RainTomorrow"], axis=1)
y = df["RainTomorrow"]

num_features = X.select_dtypes(include=np.number).columns.to_list()
cat_features = X.select_dtypes(include="O").columns.to_list()

numeric_transformer = StandardScaler()
one_hot_transformer = OneHotEncoder(sparse_output=False)
label_transformer = LabelEncoder()

preprocessor = ColumnTransformer(
    [
        ("StandarScaler", numeric_transformer, num_features),
        ("XOneHotEncoder", one_hot_transformer, cat_features)
    ]
)

In [12]:
X = preprocessor.fit_transform(X)

In [13]:
X

array([[-0.20711502,  0.3062123 ,  0.67728044, ...,  0.        ,
         0.        ,  0.        ],
       [-0.27792368,  0.3062123 , -1.1303694 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.27792368,  0.45835141,  0.56430233, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.27792368, -1.36731787, -0.11356637, ...,  0.        ,
         0.        ,  0.        ],
       [-0.27792368, -0.22627457, -0.56547883, ...,  0.        ,
         0.        ,  0.        ],
       [-0.27792368, -0.91090055, -0.11356637, ...,  0.        ,
         0.        ,  0.        ]], shape=(140485, 58))

In [14]:
y = label_transformer.fit_transform(y)

### Data Splitting

In [15]:
from imblearn.combine import SMOTEENN

smt = SMOTEENN(random_state=42, sampling_strategy='minority')
X_res, y_res = smt.fit_resample(X, y)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

### Baseline Model

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [18]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBoosting Classifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [19]:
def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return acc, f1, precision, recall, roc_auc

In [37]:
def train_model(X, y, models) -> pd.DataFrame:
    """_summary_

    Args:
        X (_type_): _description_
        y (_type_): _description_
        models (_type_): _description_

    Returns:
        pd.DataFrame: _description_
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    results = []
    
    for name, model in models.items():
        time_start = time.time()
        model.fit(X_train, y_train)
        duration = time.time() - time_start
        
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        train_acc, train_f1, train_precision, train_recall, train_rocauc = evaluate_model(
            y_train, y_train_pred
        )
        
        test_acc, test_f1, test_precision, test_recall, test_rocauc = evaluate_model(
            y_test, y_test_pred
        )
        
        results.append({
            "Model": name,
            "Duration": duration,
            "Train Accuracy": train_acc,
            "Train F1": train_f1,
            "Train Precision": train_precision,
            "Train Recall": train_recall,
            "Train ROC-AUC": train_rocauc,
            "Test Accuracy": test_acc,
            "Test F1": test_f1,
            "Test Precision": test_precision,
            "Test Recall": test_recall,
            "Test ROC-AUC": test_rocauc,
        })
    
    report = pd.DataFrame(results).sort_values(by=["Test Accuracy"], ascending=False)
    return report

In [21]:
base_model_report = train_model(X=X_res, y=y_res, models=models)

In [22]:
base_model_report

Unnamed: 0,Model,Train Accuracy,Train F1,Train Precision,Train Recall,Train ROC-AUC,Test Accuracy,Test F1,Test Precision,Test Recall,Test ROC-AUC
4,K-Neighbors Classifier,0.981758,0.984076,0.974563,0.993778,0.97989,0.964588,0.969515,0.954658,0.984842,0.961193
0,Random Forest,1.0,1.0,1.0,1.0,1.0,0.9561,0.961688,0.959743,0.963642,0.954835
6,CatBoosting Classifier,0.964907,0.968666,0.981356,0.9563,0.966244,0.955951,0.960934,0.97476,0.947495,0.957368
5,XGBoosting Classifier,0.957543,0.96211,0.974216,0.950301,0.958668,0.946837,0.952921,0.965165,0.940983,0.947818
1,Decision Tree,1.0,1.0,1.0,1.0,1.0,0.913152,0.924106,0.923481,0.924732,0.911211
2,Gradient Boosting,0.899748,0.911166,0.915965,0.906418,0.898712,0.897605,0.909716,0.917329,0.902229,0.89683
7,Support Vector Classifier,0.901759,0.912424,0.922837,0.902243,0.901683,0.896503,0.908492,0.918677,0.898531,0.896164
8,AdaBoost Classifier,0.851566,0.867357,0.879453,0.85559,0.850941,0.849029,0.865846,0.880077,0.852068,0.84852
3,Logistic Regression,0.842184,0.859702,0.867089,0.85244,0.840591,0.840332,0.858791,0.868652,0.849151,0.838854


#### Hyperparameter Tuning

After getting top 3 baseline model, we need to adjust some parameter to get the better performance. So, we will use `RandomSearchCV` to get the best model and the best parameter from top 3 baseline model.

In [31]:
kn_params = {
    "n_neighbors": [3, 4, 5, 6, 7],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "weights": ["uniform", "distance"],
    "leaf_size": [10, 20, 30, 40, 50],
    "p": [1, 2, 3]
}
rf_params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
cb_params = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [500, 1000, 2000],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "bagging_temperature": [0, 0.5, 1, 2, 5],
    "random_strength": [0, 0.5, 1],
    "subsample": [0.6, 0.8, 1.0]
}

In [38]:
randomcv_models = [
    ("KNN", KNeighborsClassifier(), kn_params),
    ("RF", RandomForestClassifier(), rf_params),
    ("CB", CatBoostClassifier(verbose=False), cb_params)
]

In [35]:
import time
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random_search = RandomizedSearchCV(estimator=model,
                                       param_distributions=params,
                                       n_iter=10,
                                       cv=3,
                                       verbose=2,
                                       n_jobs=-1)
    
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_

for model_name in model_param:
    print(f"------------------ Best Params for {model_name} ------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
0:	learn: 0.5188600	total: 170ms	remaining: 5m 40s
1:	learn: 0.4296248	total: 358ms	remaining: 5m 57s
2:	learn: 0.3784327	total: 546ms	remaining: 6m 3s
3:	learn: 0.3479084	total: 712ms	remaining: 5m 55s
4:	learn: 0.3258991	total: 881ms	remaining: 5m 51s
5:	learn: 0.3074874	total: 1.04s	remaining: 5m 47s
6:	learn: 0.2941578	total: 1.21s	remaining: 5m 43s
7:	learn: 0.2815292	total: 1.36s	remaining: 5m 39s
8:	learn: 0.2721433	total: 1.53s	remaining: 5m 38s
9:	learn: 0.2637354	total: 1.69s	remaining: 5m 36s
10:	learn: 0.2545514	total: 1.85s	remaining: 5m 34s
11:	learn: 0.2462327	total: 2.01s	remaining: 5m 33s
12:	learn: 0.2408347	total: 2.17s	remaining: 5m 31s
13:	learn: 0.2339839	total: 2.34s	remaining: 5m 32s
14:	learn: 0.2294857	total: 2.52s	remaining: 5m 33s
15:	learn: 0.2240681	total: 2.7s	remaining: 5m 3

### Retraining the Model with the Best Parameters

In [39]:
best_model = {
    "K-Neighbors Classifier": KNeighborsClassifier(**model_param["KNN"]),
    "Random Forest Classifier": RandomForestClassifier(**model_param["RF"]),
    "CatBoost Classifier": CatBoostClassifier(verbose=False, **model_param["CB"])
}

tuned_report = train_model(X=X_res, y=y_res, models=best_model)
tuned_report

Unnamed: 0,Model,Duration,Train Accuracy,Train F1,Train Precision,Train Recall,Train ROC-AUC,Test Accuracy,Test F1,Test Precision,Test Recall,Test ROC-AUC
0,K-Neighbors Classifier,0.028632,1.0,1.0,1.0,1.0,1.0,0.980552,0.983178,0.97258,0.99401,0.978296
2,CatBoost Classifier,350.667674,0.999799,0.999823,1.0,0.999646,0.999823,0.961073,0.965631,0.975041,0.956402,0.961856
1,Random Forest Classifier,151.665545,1.0,1.0,1.0,1.0,1.0,0.957648,0.962978,0.962628,0.96333,0.956696


### **Best Model is K-Neighbors Classifier with 98.05% Accuracy and 0.02 seconds time taken.**