In [1]:
import os

%pwd

'/home/tousside/Documents/recrutement/cowrywise-customer-plan-abandonment/research'

# Move the working directory up one level so later relative paths resolve from the parent folder.
# NOTE(review): this is not idempotent — re-running the cell climbs one more directory each time.
os.chdir("../")

%pwd



'/home/ubuntu/africlimateai/rainfall-prediction'

In [32]:


from dataclasses import dataclass
from pathlib import Path
from typing import Any


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory that receives the generated ``train.csv`` and ``test.csv``.
        data_path: Location of the input CSV file.
        all_schema: Parsed schema object. It is read through ``.COLUMNS`` and
            ``.TARGET_COLUMN`` (both exposing ``.keys()``), so it is a mapping-like
            object rather than a string.
        test_size: Fraction of the rows assigned to the test split.
        look_back: Number of lagged steps created for every variable.
    """

    root_dir: Path
    data_path: Path
    # Annotated as Any: the value is the whole parsed schema, not a str.
    all_schema: Any
    test_size: float
    look_back: int



In [33]:


from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [35]:
class ConfigurationManager:
    """Reads the config, params and schema YAML files and builds stage configs.

    Creating an instance also creates the directory named by ``artifacts_root``
    in the main config file.
    """

    def __init__(self,
                 config_filepath: str = CONFIG_FILE_PATH,
                 params_filepath: str = PARAMS_FILE_PATH,
                 schema_filepath: str = SCHEMA_FILE_PATH,
                 ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` and returns a
        ``DataTransformationConfig`` populated from the ``data_transformation``
        section of the main config, together with the full schema.
        """
        stage = self.config.data_transformation
        full_schema = self.schema
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
            all_schema=full_schema,
            test_size=stage.test_size,
            look_back=stage.look_back,
        )

In [5]:

import os 
from mlProject import logger
import pandas as pd
import numpy as np
from copy import deepcopy as dc

In [None]:
class DataTransformation:
    """Splits the raw CSV chronologically and builds lagged train/test tables."""

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def prepare_dataframe_for_lstm(self, df, n_steps=7):
        """Return a copy of ``df`` indexed by time with lagged copies of every column.

        Args:
            df: Frame containing a ``"time"`` column plus the variables to lag.
            n_steps: Number of lags to create per variable (``t-1`` .. ``t-n_steps``).

        Returns:
            A new frame indexed by the parsed ``"time"`` column. It holds the
            original variables followed by, for each variable in turn, its
            ``"{variable}(t-{i})"`` columns. Rows containing any NaN (including
            the first ``n_steps`` rows created by shifting) are dropped.
        """
        # assign() returns a new frame, so the caller's data is never mutated.
        indexed = df.assign(time=pd.to_datetime(df["time"])).set_index("time")
        # Build every lag first and attach them in one concat; inserting the
        # columns one by one fragments the frame as look_back grows.
        lagged = [
            indexed[variable].shift(i).rename(f"{variable}(t-{i})")
            for variable in indexed.columns
            for i in range(1, n_steps + 1)
        ]
        return pd.concat([indexed, *lagged], axis=1).dropna()

    def _prepare_split(self, split, features, target):
        """Blank unreliable targets, interpolate gaps and add lags for one split."""
        split = split.copy()
        sensor_columns = [f for f in features if f != "time"]
        # A row counts as "clogged" when every non-time feature is missing.
        # The timestamp must be left out of this check: it is never missing,
        # so including it would make the mask always False.
        # NOTE(review): presumably such rows mean the gauge was blocked and the
        # recorded target cannot be trusted — confirm with the data owner.
        if sensor_columns:
            is_clog = split[sensor_columns].isna().all(axis=1)
            split.loc[is_clog, target[0]] = np.nan
        numeric_columns = sensor_columns + target
        split.loc[:, numeric_columns] = split[numeric_columns].interpolate(
            method="spline", order=3
        )
        return self.prepare_dataframe_for_lstm(df=split, n_steps=self.config.look_back)

    def train_test_spliting(self):
        """Create ``train.csv`` and ``test.csv`` under ``config.root_dir``.

        The input CSV is reduced to the schema's feature and target columns and
        split by position: the last ``int(test_size * len(data))`` rows form the
        test set. Each split is cleaned and lagged independently, so no values
        are interpolated or shifted across the split boundary.

        Note:
            The files are written with ``index=False``, so the time index is
            not stored in the CSVs.
        """
        data = pd.read_csv(self.config.data_path)
        features = list(self.config.all_schema.COLUMNS.keys())
        target = list(self.config.all_schema.TARGET_COLUMN.keys())
        data = data[features + target]

        data_size = len(data)
        test_size = int(self.config.test_size * data_size)
        train_size = data_size - test_size

        # Process both splits before writing anything, so a failure on either
        # one leaves no partial output behind.
        train_set = self._prepare_split(data.iloc[:train_size], features, target)
        test_set = self._prepare_split(data.iloc[train_size:], features, target)

        train_set.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test_set.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(f"Training set {train_set.shape}")
        logger.info(f"Testin set {test_set.shape}")



In [37]:
# Run the data-transformation stage end to end: load the configuration, build the
# stage config and write the train/test CSV files.
# No try/except wrapper: a handler that only re-raises adds nothing and just
# inserts an extra frame into the traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()

[2025-09-25 14:19:51,505: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-09-25 14:19:51,508: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-25 14:19:51,511: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-09-25 14:19:51,514: INFO: common: created directory at: artifacts]
[2025-09-25 14:19:51,515: INFO: common: created directory at: artifacts/data_transformation]
[2025-09-25 14:19:51,630: INFO: 255624680: Splited data into training and test sets]
[2025-09-25 14:19:51,630: INFO: 255624680: Training set (207, 96)]
[2025-09-25 14:19:51,631: INFO: 255624680: Testin set (46, 96)]
