## Import libraries

In [1]:
# ==========================
# Data handling
# ==========================
import numpy as np
import pandas as pd

# ==========================
# Model saving/loading
# ==========================
import pickle

# ==========================
# Classification models
# ==========================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ==========================
# Model selection and tuning
# ==========================
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

# ==========================
# Data preprocessing
# ==========================
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# ==========================
# Evaluation metrics
# ==========================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ==========================
# Miscellaneous 
# ==========================
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
from time import time

In [2]:
# Load the daily weather observations; the path is relative to the notebook's working directory
df = pd.read_csv('../data/seattleWeather_1948-2017.csv')
# Display the frame to eyeball the columns and value ranges
df

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True
...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False
25547,2017-12-11,0.00,49,29,False
25548,2017-12-12,0.00,46,32,False
25549,2017-12-13,0.00,48,34,False


| Variable | Description                                                                 |
|----------|-----------------------------------------------------------------------------|
| DATE     | The date of the observation                                                 |
| PRCP     | The amount of precipitation, in inches                                      |
| TMAX     | The maximum temperature for that day, in degrees Fahrenheit                 |
| TMIN     | The minimum temperature for that day, in degrees Fahrenheit                 |
| RAIN     | TRUE if rain was observed on that day, FALSE if it was not                  |


### Convert units from imperial to metric (Fahrenheit → Celsius, inches → millimeters)

In [3]:
# Fahrenheit -> Celsius: subtract 32, then scale by 5/9.
# NOTE: this cell overwrites the columns, so running it twice converts the values twice.
df['TMAX'] = df['TMAX'].sub(32).mul(5).div(9)
df['TMIN'] = df['TMIN'].sub(32).mul(5).div(9)
# Inches -> millimeters
df['PRCP'] = df['PRCP'].mul(25.4)

In [4]:
# Display the frame again to confirm the converted temperature and precipitation values
df

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,11.938,10.555556,5.555556,True
1,1948-01-02,14.986,7.222222,2.222222,True
2,1948-01-03,10.668,7.222222,1.666667,True
3,1948-01-04,7.874,7.222222,1.111111,True
4,1948-01-05,4.318,7.222222,0.000000,True
...,...,...,...,...,...
25546,2017-12-10,0.000,9.444444,1.111111,False
25547,2017-12-11,0.000,9.444444,-1.666667,False
25548,2017-12-12,0.000,7.777778,0.000000,False
25549,2017-12-13,0.000,8.888889,1.111111,False


In [5]:
# Check column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25551 entries, 0 to 25550
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DATE    25551 non-null  object 
 1   PRCP    25548 non-null  float64
 2   TMAX    25551 non-null  float64
 3   TMIN    25551 non-null  float64
 4   RAIN    25548 non-null  object 
dtypes: float64(3), object(2)
memory usage: 998.2+ KB


In [6]:
# Drop rows containing missing values (df.info() above reports 3 nulls each in PRCP and RAIN).
# This mutates df in place, so the original row count is not recoverable without reloading.
df.dropna(inplace=True)

In [7]:
# Encode the RAIN labels as integers (rows that were True display as 1 afterwards)
label_encoder = preprocessing.LabelEncoder()
df["RAIN"] = label_encoder.fit_transform(df["RAIN"])
# Check the class balance of the encoded target
df["RAIN"].value_counts()

RAIN
0    14648
1    10900
Name: count, dtype: int64

In [8]:
# Parse the DATE strings so calendar components can be pulled out via the .dt accessor
df["DATE"] = pd.to_datetime(df["DATE"], format="%Y-%m-%d")

# Derive day / month / year features from the parsed dates
df = df.assign(
    DAY=df["DATE"].dt.day,
    MONTH=df["DATE"].dt.month,
    YEAR=df["DATE"].dt.year,
)

# Keep only the model inputs and the target, in a fixed order; DATE is left out here.
# NOTE(review): PRCP is also not carried forward — presumably because it would
# leak the RAIN label; confirm this is intentional.
df = df[["DAY", "MONTH", "YEAR", "TMAX", "TMIN", "RAIN"]]
df.head()

Unnamed: 0,DAY,MONTH,YEAR,TMAX,TMIN,RAIN
0,1,1,1948,10.555556,5.555556,1
1,2,1,1948,7.222222,2.222222,1
2,3,1,1948,7.222222,1.666667,1
3,4,1,1948,7.222222,1.111111,1
4,5,1,1948,7.222222,0.0,1


In [9]:
# Re-check dtypes and non-null counts after cleaning and feature extraction
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25548 entries, 0 to 25550
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DAY     25548 non-null  int32  
 1   MONTH   25548 non-null  int32  
 2   YEAR    25548 non-null  int32  
 3   TMAX    25548 non-null  float64
 4   TMIN    25548 non-null  float64
 5   RAIN    25548 non-null  int32  
dtypes: float64(2), int32(4)
memory usage: 998.0 KB


## Train / Test Split

In [10]:
# Separate the target column from the predictor columns
y = df['RAIN']
X = df.drop('RAIN', axis=1)

In [11]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [12]:
# Name of the target column.
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
target = 'RAIN'

### Baseline models

In [13]:
# Baseline classifiers, keyed by the display name used in the result tables.
# The same keys are used to look up each model's grid in the tuning step.
models = {
    'Logistic Regression': LogisticRegression(
        penalty='l2',
        C=1.0,
        max_iter=1000,
        solver='liblinear',
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    ),
    'AdaBoost': AdaBoostClassifier(
        n_estimators=100,
        learning_rate=0.8,
        random_state=42,
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
}

In [14]:
def cross_validate_models_classification(models: dict, X_train, y_train, cv: int = 5) -> tuple:
    """Cross-validate each classifier inside a scaling pipeline and tabulate the mean scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    X_train, y_train
        Training features and labels passed straight to ``cross_validate``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_validate``.

    Returns
    -------
    tuple
        ``(cv_results_df, pipes)`` where ``cv_results_df`` is a DataFrame with the
        columns ``Model``, ``Accuracy``, ``F1 Score``, ``Precision`` and ``Recall``
        (one row per model, each score averaged over the folds), and ``pipes`` maps
        each model name to the pipeline object that was cross-validated.
    """
    scoring = ('accuracy', 'f1', 'precision', 'recall')
    columns = ['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall']

    rows = []   # one record per model
    pipes = {}  # model name -> pipeline

    for name, alg in models.items():
        # Build a fresh scaler per pipeline so the stored pipelines never share
        # a single StandardScaler instance (fitting one would otherwise change
        # the scaler seen by all the others).
        pipe = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', alg),
        ])

        cv_scores = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=cv)

        rows.append({
            'Model': name,
            'Accuracy': np.mean(cv_scores['test_accuracy']),
            'F1 Score': np.mean(cv_scores['test_f1']),
            'Precision': np.mean(cv_scores['test_precision']),
            'Recall': np.mean(cv_scores['test_recall']),
        })
        pipes[name] = pipe

    # Passing `columns` keeps the table schema stable even when `models` is empty.
    cv_results_df = pd.DataFrame(rows, columns=columns)

    return cv_results_df, pipes


In [15]:
# Cross-validate every baseline model and keep the pipelines for the tuning step
cv_results, pipes = cross_validate_models_classification(models, X_train, y_train)

In [16]:
# Mean cross-validation scores per baseline model
cv_results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.756727,0.715624,0.713906,0.717661
1,Random Forest,0.767688,0.734696,0.716561,0.753899
2,Gradient Boosting,0.767932,0.73621,0.714725,0.759174
3,AdaBoost,0.765486,0.730229,0.716992,0.744037
4,XGBoost,0.767687,0.7352,0.715644,0.755963


## Hyperparameter optimization 

In [17]:
# Search space per model. Outer keys match the model names used for the pipelines;
# inner keys carry the 'classifier__' prefix so they target the pipeline's
# 'classifier' step.
param_grids = {
    'Logistic Regression': dict(
        classifier__C=[0.01, 0.1, 1, 10],
        classifier__penalty=['l2'],
        classifier__solver=['lbfgs'],
    ),
    'Random Forest': dict(
        classifier__n_estimators=[100, 200],
        classifier__max_depth=[None, 10, 20],
        classifier__min_samples_split=[2, 5],
    ),
    'Gradient Boosting': dict(
        classifier__n_estimators=[100, 200],
        classifier__learning_rate=[0.05, 0.1],
        classifier__max_depth=[3, 5],
    ),
    'AdaBoost': dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 1],
    ),
    'XGBoost': dict(
        classifier__n_estimators=[100, 200],
        classifier__learning_rate=[0.05, 0.1],
        classifier__max_depth=[3, 5, 7],
    ),
}

In [18]:
def hyperparameter_tuning_classification(pipelines: dict, param_grids: dict, X_train, y_train, cv_scoring: str) -> dict:
    """Grid-search every pipeline that has a parameter grid; fit the rest as-is.

    Parameters
    ----------
    pipelines : dict
        Maps a model name to the pipeline to tune.
    param_grids : dict
        Maps a model name to its ``GridSearchCV`` parameter grid. A pipeline whose
        name is missing (or whose grid is empty) is cross-validated and fitted directly.
    X_train, y_train
        Training data used for the search and the fits.
    cv_scoring : str
        Scorer name. When it contains ``'neg_'`` the printed score has its sign flipped.

    Returns
    -------
    dict
        Maps each model name to the grid search's ``best_estimator_`` or, when no
        grid was available, to the pipeline fitted on the training data.
    """
    negate_score = 'neg_' in cv_scoring
    tuned_models = {}

    for name, pipe in pipelines.items():
        grid = param_grids.get(name)

        # No search space: report a plain cross-validated score and fit once.
        if not grid:
            print(f'No parameter grid found for {name}. Fitting model directly...')

            t0 = time()
            cv_out = cross_validate(pipe, X_train, y_train, scoring=cv_scoring)
            pipe.fit(X_train, y_train)
            tuning_time = time() - t0

            if tuning_time < 60:
                print(f'Fitting {name} took: {tuning_time:.3f} seconds')
            else:
                print(f'Fitting {name} took: {tuning_time/60:.3f} minutes')

            mean_score = np.mean(cv_out['test_score'])
            if negate_score:
                mean_score = -mean_score
            print(f'Score from CV: {mean_score:.5f}\n')

            tuned_models[name] = pipe
            continue

        print(f'Tuning {name} hyperparameters...')

        gs = GridSearchCV(pipe, param_grid=grid, cv=5, scoring=cv_scoring, n_jobs=-1, verbose=1)

        t0 = time()
        gs.fit(X_train, y_train)
        tuning_time = time() - t0

        print(f'---- Hyperparameter tuning complete ----')
        if tuning_time < 60:
            print(f'Tuning {name} took: {tuning_time:.3f} seconds')
        else:
            print(f'Tuning {name} took: {tuning_time/60:.3f} minutes')

        score = gs.best_score_
        if negate_score:
            score = -score
        print(f'Best Score: {score:.5f}')
        print(f'Best parameters:\n{gs.best_params_}\n')

        tuned_models[name] = gs.best_estimator_

    return tuned_models


In [19]:
# Tune every pipeline against its grid, selecting candidates by cross-validated accuracy
tuned_models = hyperparameter_tuning_classification(pipes, param_grids, X_train, y_train, 'accuracy')

Tuning Logistic Regression hyperparameters...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
---- Hyperparameter tuning complete ----
Tuning Logistic Regression took: 2.509 seconds
Best Score: 0.75687
Best parameters:
{'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}

Tuning Random Forest hyperparameters...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
---- Hyperparameter tuning complete ----
Tuning Random Forest took: 13.594 seconds
Best Score: 0.76896
Best parameters:
{'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}

Tuning Gradient Boosting hyperparameters...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
---- Hyperparameter tuning complete ----
Tuning Gradient Boosting took: 10.023 seconds
Best Score: 0.77033
Best parameters:
{'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}

Tuning AdaBoost hyperparameters...
Fitti

## Test evaluation 

In [20]:
def test_evaluation(tuned_models: dict, X_train, y_train, X_test, y_test) -> pd.DataFrame:
    model_names = []
    best_scores = []
    accuracy = []
    precision = []
    recall = []
    f1 = []

    if not isinstance(tuned_models, dict):
        tuned_models = {f'{tuned_models.steps[-1][1].__class__.__name__}': tuned_models}

    for name, model in tuned_models.items():
        print(f"Evaluating model: {name}...")

        # Fitting the model on training data
        model.fit(X_train, y_train)

        # Predicting on test data
        y_preds = model.predict(X_test)

        # Calculating classification metrics
        accuracy_score_value = accuracy_score(y_test, y_preds)
        precision_score_value = precision_score(y_test, y_preds, average='weighted')
        recall_score_value = recall_score(y_test, y_preds, average='weighted')
        f1_score_value = f1_score(y_test, y_preds, average='weighted')

        # Storing results
        model_names.append(name)
        accuracy.append(accuracy_score_value)
        precision.append(precision_score_value)
        recall.append(recall_score_value)
        f1.append(f1_score_value)

        # Calculating best score based on accuracy (you can change this to f1_score_value or another metric)
        best_score = accuracy_score_value
        best_scores.append(best_score)

        print(f'📊 Model: {name}')
        print(f' Best Score (based on accuracy): {best_score:.5f}')
        print(f' Accuracy: {accuracy_score_value:.5f} | Precision: {precision_score_value:.5f} | Recall: {recall_score_value:.5f} | F1-Score: {f1_score_value:.5f}\n')

    # Creating the DataFrame to show the results
    evaluation_results = pd.DataFrame({
        'Model': model_names,
        'Best Score': best_scores,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })

    return evaluation_results

In [21]:
# Refit each tuned model on the training split and score it on the held-out test split
evaluation_results = test_evaluation(tuned_models, X_train, y_train, X_test, y_test)

Evaluating model: Logistic Regression...
📊 Model: Logistic Regression
 Best Score (based on accuracy): 0.75656
 Accuracy: 0.75656 | Precision: 0.75661 | Recall: 0.75656 | F1-Score: 0.75658

Evaluating model: Random Forest...
📊 Model: Random Forest
 Best Score (based on accuracy): 0.77260
 Accuracy: 0.77260 | Precision: 0.77414 | Recall: 0.77260 | F1-Score: 0.77313

Evaluating model: Gradient Boosting...
📊 Model: Gradient Boosting
 Best Score (based on accuracy): 0.77671
 Accuracy: 0.77671 | Precision: 0.77924 | Recall: 0.77671 | F1-Score: 0.77743

Evaluating model: AdaBoost...
📊 Model: AdaBoost
 Best Score (based on accuracy): 0.77299
 Accuracy: 0.77299 | Precision: 0.77340 | Recall: 0.77299 | F1-Score: 0.77317

Evaluating model: XGBoost...
📊 Model: XGBoost
 Best Score (based on accuracy): 0.77417
 Accuracy: 0.77417 | Precision: 0.77570 | Recall: 0.77417 | F1-Score: 0.77469



In [22]:
# Test-set metrics per tuned model
evaluation_results

Unnamed: 0,Model,Best Score,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.756556,0.756556,0.756614,0.756556,0.756584
1,Random Forest,0.772603,0.772603,0.774136,0.772603,0.773129
2,Gradient Boosting,0.776712,0.776712,0.779239,0.776712,0.777433
3,AdaBoost,0.772994,0.772994,0.7734,0.772994,0.773173
4,XGBoost,0.774168,0.774168,0.775695,0.774168,0.774691


## Save the best model pipeline to disk using pickle

In [23]:
def save_best_model(test_resultsDF: pd.DataFrame, tuned_models: dict, selection_metric='', save_path='../Models/best_model.pkl'):
    """Pickle the model whose row has the highest value of ``selection_metric``.

    Parameters
    ----------
    test_resultsDF : pd.DataFrame
        Evaluation table with a ``Model`` column plus one column per metric.
    tuned_models : dict
        Maps the names found in the ``Model`` column to the fitted estimators.
    selection_metric : str
        Column of ``test_resultsDF`` to maximise. Must name an existing column;
        the empty default is kept for interface compatibility and always raises.
    save_path : str or path-like
        Destination file. Missing parent directories are created.

    Raises
    ------
    KeyError
        If ``selection_metric`` is not a column of ``test_resultsDF``, or the
        selected model name is missing from ``tuned_models``.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if selection_metric not in test_resultsDF.columns:
        raise KeyError(
            f"selection_metric {selection_metric!r} is not a column of the results table; "
            f"available columns: {list(test_resultsDF.columns)}"
        )

    best_name = test_resultsDF.loc[test_resultsDF[selection_metric].idxmax(), 'Model']
    best_model = tuned_models[best_name]

    # Create the destination folder so a fresh checkout does not fail on the write.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The context manager guarantees the handle is flushed and closed.
    with open(save_path, 'wb') as fh:
        pickle.dump(best_model, fh)


In [24]:
# Persist the pipeline with the highest test-set Recall.
# NOTE(review): test_evaluation computes Recall with average='weighted', and in the
# results table above it is identical to Accuracy, so this selects the most accurate model.
save_best_model(evaluation_results, tuned_models, selection_metric='Recall')