In [1]:
import os

# Display the kernel's current working directory before it is changed below.
%pwd

'/home/tousside/Documents/recrutement/cowrywise-customer-plan-abandonment/research'

os.chdir("../")

%pwd



'/home/ubuntu/africlimateai/rainfall-prediction'

In [2]:
import pandas as pd

# Read the transformed train and test splits; both CSVs are indexed by their
# "time" column. Paths are relative to the current working directory.
data = pd.read_csv(
    "artifacts/data_transformation/train.csv",
    index_col="time",
)
data_test = pd.read_csv(
    "artifacts/data_transformation/test.csv",
    index_col="time",
)

# Print the column list, dtypes and non-null counts of the training frame.
data.info()



<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, 2024-11-27 to 2025-06-21
Data columns (total 89 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   temperature             207 non-null    float64
 1   feels_like              207 non-null    float64
 2   app_temp                207 non-null    float64
 3   dew_point               207 non-null    float64
 4   humidity                207 non-null    float64
 5   wind_direction          207 non-null    float64
 6   wind_speed              207 non-null    float64
 7   wind_gust               207 non-null    float64
 8   pressure_relative       207 non-null    float64
 9   pressure_absolute       207 non-null    float64
 10  rainfall                207 non-null    float64
 11  temperature(t-1)        207 non-null    float64
 12  temperature(t-2)        207 non-null    float64
 13  temperature(t-3)        207 non-null    float64
 14  temperature(t-4)        207 non

In [17]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings read by the model-evaluation stage.

    Attributes:
        root_dir: Directory of the evaluation stage; the configuration
            manager creates it before building this object.
        test_data_path: CSV file with the test split (read with a "time" index).
        scaler_path: Serialized feature scaler, loaded with joblib.
        classifier_path: Serialized classifier, loaded with joblib.
        regressor_path: Serialized regression model, loaded with torch.load.
        classifier_params: Classifier hyper-parameters, logged to MLflow.
        regressor_params: Regressor hyper-parameters, logged to MLflow.
        metric_file_name: Name of the metrics file.
        classification_target_column: Mapping whose first key is the name of
            the classification target column.
        regression_target_column: Mapping whose first key is the name of the
            regression target column.
        mlflow_uri: URI handed to MLflow by the evaluation stage.
        batch_size: Batch size of the test DataLoader.
    """

    # NOTE(review): the path fields are filled straight from the parsed YAML,
    # so at runtime they are presumably plain strings rather than Path
    # objects -- confirm against read_yaml.
    root_dir: Path
    test_data_path: Path
    scaler_path: Path
    classifier_path: Path
    regressor_path: Path
    classifier_params: dict
    regressor_params: dict
    metric_file_name: str
    # The evaluation code calls `.keys()` on both target-column fields, so
    # they are mappings (column name -> schema entry), not plain strings.
    classification_target_column: dict
    regression_target_column: dict
    mlflow_uri: str
    batch_size: int


In [18]:
import os
import pandas as pd
import mlflow
import mlflow.sklearn
import numpy as np
import joblib
from urllib.parse import urlparse

In [19]:


from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories, save_json
from mlProject import logger


In [20]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects.

    The parsed config, params and schema files are kept on the instance as
    ``config``, ``params`` and ``schema``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Parse the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyper-parameter YAML.
            schema_filepath: Location of the data-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root must exist before any stage directory is created.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Assemble the settings for the model-evaluation stage.

        Creates the stage's root directory, then combines values from the
        ``model_evaluation`` section of the config file with the
        ``classification`` / ``regression`` parameter groups and the two
        target-column entries of the schema.

        Returns:
            A populated ModelEvaluationConfig.
        """
        stage = self.config.model_evaluation
        hyper_params = self.params
        data_schema = self.schema

        create_directories([stage.root_dir])

        return ModelEvaluationConfig(
            root_dir=stage.root_dir,
            test_data_path=stage.test_data_path,
            scaler_path=stage.scaler_path,
            classifier_path=stage.classifier_path,
            regressor_path=stage.regressor_path,
            classifier_params=hyper_params.classification,
            regressor_params=hyper_params.regression,
            metric_file_name=stage.metric_file_name,
            classification_target_column=data_schema.TARGET_COLUMN_CLASSIFICATION,
            regression_target_column=data_schema.TARGET_COLUMN,
            mlflow_uri=stage.mlflow_uri,
            batch_size=stage.batch_size,
        )



In [63]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch import optim
from torch.utils.data import DataLoader
from copy import deepcopy as dc
from sklearn.metrics import root_mean_squared_error


In [64]:
class ModelEvaluation:
    """Scores the classifier-gated regressor on the test split and logs to MLflow.

    The pipeline is two-stage: a classifier is run on the scaled features, and
    the regression model is consulted only for rows the classifier does not
    label 0. Rows labelled 0 get a prediction of 0.0.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """Store the evaluation settings; no I/O happens here."""
        self.config = config

    class TimeSeriesDataset(Dataset):
        """Map-style dataset that pairs each feature row with its target."""

        def __init__(self, X, y):
            self.X = X
            self.y = y

        def __len__(self):
            return len(self.X)

        def __getitem__(self, i):
            return self.X[i], self.y[i]

    @staticmethod
    def _gated_predictions(model, loader, classifier_preds, device):
        """Run the regressor only on samples the classifier did not label 0.

        Args:
            model: Regression model, already moved to ``device`` and in eval mode.
            loader: Unshuffled DataLoader over the test set, so batch order
                matches the row order of ``classifier_preds``.
            classifier_preds: One classifier label per test row.
            device: Device the batches are moved to before inference.

        Returns:
            A list of floats, one per test row, in row order.
        """
        predictions = []
        offset = 0  # index of the first row of the current batch
        with torch.no_grad():
            for X_batch, _ in loader:
                X_batch = X_batch.to(device)
                gate = classifier_preds[offset:offset + len(X_batch)]
                offset += len(X_batch)

                for sample, clf_pred in zip(X_batch, gate):
                    if clf_pred == 0:
                        # If classifier said 0 -> output 0
                        predictions.append(0.0)
                    else:
                        # One sample at a time, with a leading batch dim of 1.
                        reg_out = model(sample.unsqueeze(0))
                        predictions.append(reg_out[0].item())
        return predictions

    def log_into_mlflow(self):
        """Evaluate on the test CSV and record params, artifacts and RMSE in MLflow.

        Reads the test data, scales the features, obtains classifier labels,
        produces gated regression predictions and computes the RMSE against
        the regression target. Hyper-parameters, the three model/scaler files
        and the RMSE are logged inside a single MLflow run of the
        "rainfall-prediction" experiment.

        Returns:
            tuple: ``(predictions, rmse)`` where ``predictions`` is an array
            reshaped to ``(-1, 1)`` and ``rmse`` is the value returned by
            ``root_mean_squared_error``.
        """
        # NOTE(review): this sets only the model-registry URI. The tracking
        # URI is left to MLflow's default / environment configuration --
        # confirm that runs are meant to be tracked there.
        # NOTE(review): config.metric_file_name is not written by this method.
        mlflow.set_registry_uri(self.config.mlflow_uri)
        mlflow.set_experiment("rainfall-prediction")
        with mlflow.start_run():
            test_data = pd.read_csv(self.config.test_data_path, index_col="time")

            # The target-column settings are mappings; their first key is the
            # column name.
            classification_target_column = list(self.config.classification_target_column.keys())[0]
            regression_target_column = list(self.config.regression_target_column.keys())[0]
            targets_columns = [classification_target_column, regression_target_column]

            # Features: every column except the two targets.
            X_test = test_data.drop(columns=targets_columns).values

            # Targets: regression target as a column vector.
            y_test = test_data[regression_target_column].values.reshape(-1, 1)

            # Load classifier + scaler
            scaler = joblib.load(self.config.scaler_path)
            classifier = joblib.load(self.config.classifier_path)

            # Scale features
            X_test_scaled = scaler.transform(X_test)
            classifier_preds = classifier.predict(X_test_scaled)

            # Prepare LSTM input: reverse the column order, then add a
            # trailing axis of size 1. The deepcopy materialises the flipped
            # view as a regular array before it is turned into a tensor.
            X_test_lstm = dc(np.flip(X_test_scaled, axis=1))
            vars_dim = X_test_lstm.shape[1]
            X_test_lstm = X_test_lstm.reshape((-1, vars_dim, 1))

            # Torch tensors
            X_test_tensor = torch.tensor(X_test_lstm).float()
            y_test_tensor = torch.tensor(y_test).float()

            test_dataset = self.TimeSeriesDataset(X_test_tensor, y_test_tensor)
            # shuffle=False keeps batches aligned with classifier_preds.
            test_loader = DataLoader(
                test_dataset, batch_size=self.config.batch_size, shuffle=False
            )

            # Device
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # Load regression model. map_location lets a checkpoint saved on a
            # GPU be loaded on a CPU-only machine (and vice versa).
            # SECURITY: weights_only=False unpickles arbitrary objects; only
            # load regressor files from a trusted source.
            model = torch.load(
                self.config.regressor_path, map_location=device, weights_only=False
            )
            model.to(device)
            model.eval()

            predictions = self._gated_predictions(
                model, test_loader, classifier_preds, device
            )
            predictions = np.array(predictions).reshape(-1, 1)

            rmse = root_mean_squared_error(y_test, predictions)
            mlflow.log_params(self.config.classifier_params)
            mlflow.log_params(self.config.regressor_params)
            mlflow.log_artifact(self.config.scaler_path, artifact_path="scaler")
            mlflow.log_artifact(self.config.classifier_path, artifact_path="classifier")
            mlflow.log_artifact(self.config.regressor_path, artifact_path="regressor")
            mlflow.log_metric('rmse', rmse)

            logger.info(f"Final RMSE: {rmse}")
        return predictions, rmse
        
        
        
        

        # mlflow.set_registry_uri(self.config.mlflow_uri)
        # tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # with mlflow.start_run():


        #     (accuracy, roc_auc) = self.eval_metrics(model,  X_test, y_test)
            
        #     # Saving metrics as local
        #     scores = {"accuracy": accuracy, "roc_auc": roc_auc}
        #     save_json(path=Path(self.config.metric_file_name), data=scores)

        #     mlflow.log_params(self.config.all_params)

        #     mlflow.log_metric("accuracy", accuracy)
        #     mlflow.log_metric("roc_auc", roc_auc)


        #     # Model registry does not work with file store
        #     if tracking_url_type_store != "file":

        #         # Register the model
        #         # There are other ways to use the Model Registry, which depends on the use case,
        #         # please refer to the doc for more information:
        #         # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        #         mlflow.sklearn.log_model(model, "model", registered_model_name="XGBoostModel")
        #     else:
        #         mlflow.sklearn.log_model(model, "model")

In [65]:


# Build the evaluation settings, run the evaluation and keep its result.
# `prediction` holds the (predictions, rmse) tuple returned by log_into_mlflow().
# The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so it is dropped; errors still propagate unchanged.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
# Separate name for the evaluator so the config object is not shadowed.
model_evaluation = ModelEvaluation(config=model_evaluation_config)
prediction = model_evaluation.log_into_mlflow()



[2025-09-29 13:40:30,959: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-09-29 13:40:30,960: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-29 13:40:30,964: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-09-29 13:40:30,966: INFO: common: created directory at: artifacts]
[2025-09-29 13:40:30,967: INFO: common: created directory at: artifacts/model_evaluation]
[2025-09-29 13:40:31,017: INFO: 2125160172: Final RMSE: 6.36552784521722]
