In [1]:
import os

In [2]:
# Switch the working directory to the parent of the current one
# (presumably the project root, so relative paths such as config/config.yaml resolve — confirm).
# NOTE: the path is relative, so every re-run of this cell moves one level further up;
# run it only once per kernel session.
os.chdir('../')

In [3]:
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

from pressure_brake_prediction.exception import CustomException
from pressure_brake_prediction.logger import logging
import os


In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded configuration. The
    annotations are not enforced at runtime; values are stored exactly as
    they come out of the configuration object.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager before this object is built
    source_data_file: Path  # taken from config.source_data_file
    local_data_file: Path  # taken from config.local_data_file
    unzip_dir: Path  # taken from config.unzip_dir


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration and
    consumed by ``DataTransformation``. The annotations are not enforced at
    runtime; values are stored exactly as they come out of the configuration.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager before this object is built
    raw_train_data: Path  # CSV read with pd.read_csv(..., na_values="na") as the training input
    raw_test_data:  Path  # CSV read with pd.read_csv(..., na_values="na") as the test input
    train_data: Path  # destination passed to save_bin for the processed training array
    test_data: Path  # destination passed to save_bin for the processed test array
    target_column:  str  # label column, split off from the features before preprocessing
    preprocessor_obj_file_path: Path  # destination passed to save_bin for the fitted pipeline
    

In [5]:
from pressure_brake_prediction.constants import *
from pressure_brake_prediction.utils import read_yaml, create_directories, save_bin

In [6]:
class ConfigurationManager:
    """Loads the YAML configuration files and builds per-stage config objects.

    On construction both YAML files are read and the top-level artifacts
    directory named by ``config.artifacts_root`` is created.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAM_FILE_PATH):
        """Read the configuration and parameter files.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings and create that stage's directory.

        Returns:
            DataIngestionConfig populated from ``config.data_ingestion``.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_data_file=config.source_data_file,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )

        return data_ingestion_config

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create that stage's directory.

        Returns:
            DataTransformationConfig populated from ``config.data_transformation``.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            raw_train_data=config.raw_train_data,
            raw_test_data=config.raw_test_data,
            train_data=config.train_data,
            # Must be the test path: pointing this at train_data makes the
            # transformation stage overwrite the saved training array with
            # the test array.
            test_data=config.test_data,
            preprocessor_obj_file_path=config.preprocessor_obj_file_path,
            target_column=config.target_column
        )

        return data_transformation_config

In [7]:
import pandas as pd 
import sys
import tqdm
from pressure_brake_prediction.exception import CustomException
import joblib


In [18]:
from pressure_brake_prediction.logger import logging
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
class DataTransformation:
    """Oversamples, preprocesses and persists the train/test datasets.

    Driven entirely by a ``DataTransformationConfig``: it supplies the raw CSV
    locations, the target column name and the output paths.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def get_data_transformer_object(self):
        """Build the (unfitted) preprocessing pipeline.

        The pipeline applies, in order: mean imputation, standard scaling and
        PCA down to 65 components (``random_state=42``).

        Returns:
            sklearn ``Pipeline`` ready to be fitted.

        Raises:
            CustomException: wraps any error raised while building the pipeline.
        """
        try:
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="mean")),
                    ("scaler", StandardScaler()),
                    ("pca", PCA(n_components=65, random_state=42)),
                ]
            )

            return num_pipeline
        except Exception as e:
            raise CustomException(e, sys)

    def _split_features_and_target(self, frame):
        """Return ``(features, target)`` by splitting off the configured target column."""
        target_column = self.config.target_column
        return frame.drop(columns=[target_column]), frame[target_column]

    def initiate_transformer(self):
        """Run the full transformation stage.

        Reads the raw train/test CSVs (the literal ``"na"`` is treated as
        missing), randomly oversamples the training set so the classes are
        balanced, fits the preprocessing pipeline on the oversampled training
        features, applies it to the test features, appends the target as the
        last column of each array and persists the pipeline and both arrays
        with ``save_bin``.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_obj_file_path)``.

        Raises:
            CustomException: wraps any error raised during the stage.
        """
        try:
            train_data = pd.read_csv(self.config.raw_train_data, na_values="na")
            test_data = pd.read_csv(self.config.raw_test_data, na_values="na")

            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_object()

            X_train_data, y_train_data = self._split_features_and_target(train_data)
            X_test_data, y_test_data = self._split_features_and_target(test_data)

            logging.info("Checking for data imbalance in Train data.")

            logging.info(str(y_train_data.value_counts()))

            # The training data is highly imbalanced, so the minority class is
            # randomly oversampled. Only the training set is resampled; the
            # test set keeps its original class distribution.
            oversample_obj = RandomOverSampler(random_state=42)

            oversampled_X_train_data, oversampled_y_train_data = oversample_obj.fit_resample(
                X_train_data, y_train_data
            )

            # Log the class counts again to confirm the classes are now balanced.
            logging.info(f"Data after transformation {oversampled_y_train_data.value_counts()}")

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Fit on the training features only; the test features are merely
            # transformed so no test statistics leak into the pipeline.
            processed_train_data = preprocessing_obj.fit_transform(oversampled_X_train_data)
            processed_test_data = preprocessing_obj.transform(X_test_data)

            # Append the target as the last column of each array.
            # NOTE(review): if the target column holds strings (the logged class
            # counts suggest 'neg'/'pos'), the stacked arrays will not be purely
            # numeric — confirm downstream consumers expect that.
            train_arr = np.c_[processed_train_data, np.array(oversampled_y_train_data)]
            test_arr = np.c_[processed_test_data, np.array(y_test_data)]

            save_bin(
                path=self.config.preprocessor_obj_file_path,
                data=preprocessing_obj
            )
            # Logged only after the save call returns, so the message cannot
            # appear for a save that failed.
            logging.info("Saved preprocessing object.")

            save_bin(
                path=self.config.train_data,
                data=train_arr
            )

            save_bin(
                path=self.config.test_data,
                data=test_arr
            )

            return (train_arr,
                    test_arr,
                    self.config.preprocessor_obj_file_path)
        except Exception as e:
            raise CustomException(e, sys)
        

In [19]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage settings, then oversample, preprocess and persist the datasets.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    # data4 is the tuple (train_arr, test_arr, preprocessor_obj_file_path)
    data4 = data_transformation.initiate_transformer()
except Exception as e:
    # Re-raise wrapped in the project's exception type.
    raise CustomException(e, sys)

[2023-04-09 12:36:35,889: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-04-09 12:36:35,903: INFO: common: yaml file: params.yaml loaded successfully]
[2023-04-09 12:36:35,906: INFO: common: created directory at: artifacts]
[2023-04-09 12:36:35,909: INFO: common: created directory at: artifacts/data_transformation]
[2023-04-09 12:36:38,039: INFO: 2379095977: Read train and test data completed]
[2023-04-09 12:36:38,040: INFO: 2379095977: Obtaining preprocessing object]
[2023-04-09 12:36:38,159: INFO: 2379095977: Checking for data imbalance in Train data.]
[2023-04-09 12:36:38,170: INFO: 2379095977: neg    59000
pos     1000
Name: class, dtype: int64]
[2023-04-09 12:36:40,178: INFO: 2379095977: Data after transformation neg    59000
pos    59000
Name: class, dtype: int64]
[2023-04-09 12:36:40,179: INFO: 2379095977: Applying preprocessing object on training dataframe and testing dataframe.]
[2023-04-09 12:36:53,635: INFO: 2379095977: Saved preprocessing object.]
[2

In [None]:
# Number of rows in the processed training array (first element of the returned tuple).
len(data4[0])