In [45]:
%pwd

'C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops'

In [46]:
import os
from pathlib import Path

# Switch the kernel's working directory to the project root.
# NOTE(review): the default below is a machine-specific absolute path; set the
# MLOPS_PROJECT_ROOT environment variable to run this notebook from another
# checkout without editing the cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "MLOPS_PROJECT_ROOT",
        "C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops\\",
    )
)
os.chdir(PROJECT_ROOT)

In [47]:
%pwd

'C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops'

In [48]:
from software_defect_prediction.constants import *
from software_defect_prediction.utils.common import *
from software_defect_prediction.entity.config_entity import DataTransformationConfig
from software_defect_prediction.config.configuration import ConfigurationManager

import shutil

In [49]:
from sys import exception
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

class Data_Transformation():
    """Data-transformation step: stage the input CSV, split it and scale the features.

    The step reads three attributes from its config object: ``root_dir``
    (output directory), ``source_file_path`` (CSV to copy in) and
    ``input_file_name`` (name the staged copy is given inside ``root_dir``).

    Typical use is ``prepare_and_load_files()`` followed by
    ``tr_test_split_and_transform()``.
    """

    def __init__(self, data_transformation_config: "DataTransformationConfig", predictor_col: str) -> None:
        """Store the configuration and the name of the target column.

        Args:
            data_transformation_config: Object exposing ``root_dir``,
                ``source_file_path`` and ``input_file_name``.
            predictor_col: Name of the column that is split off as ``y``.
        """
        self.config = data_transformation_config
        self.predictor_col = predictor_col
        # Populated by prepare_and_load_files(); None until then.
        self.input_df = None

    def prepare_and_load_files(self) -> None:
        """Copy the source CSV into ``root_dir`` and load it into ``self.input_df``.

        Any previous copy at the destination is removed first.

        Raises:
            Exception: Whatever the copy or CSV read raised (for example
                ``FileNotFoundError``), re-raised after an error is logged.
        """
        try:
            destination_file_path = Path(self.config.root_dir) / self.config.input_file_name
            if destination_file_path.exists():
                os.remove(destination_file_path)

            # Copy to the explicit destination path rather than into the
            # directory: copying into the directory keeps the source's own
            # basename, which need not equal input_file_name.
            shutil.copy(self.config.source_file_path, destination_file_path)
            self.input_df = pd.read_csv(destination_file_path)

            logger.info("input file loaded successfully")
        except Exception:
            # Must be the built-in Exception class: a non-class object in an
            # except clause raises TypeError and hides the original error.
            logger.error("input file loading failed")
            raise

    @staticmethod
    def _scale_partition(scaler, features: pd.DataFrame, target: pd.Series, id_col: str) -> pd.DataFrame:
        """Scale one partition's non-id columns and re-attach the id and target columns."""
        unscaled = features.drop(columns=id_col)
        scaled = pd.DataFrame(scaler.transform(unscaled), columns=unscaled.columns)
        # The scaled frame has a fresh 0..n-1 index, so id and target are
        # attached positionally (.values / reset_index) instead of by label.
        scaled[id_col] = features[id_col].values
        return pd.concat([scaled, target.reset_index(drop=True)], axis=1)

    def tr_test_split_and_transform(self, test_size: float = 0.30, random_state: int = 42, id_col: str = "id") -> None:
        """Split the loaded data, robust-scale the features and write the artifacts.

        The split is stratified on the target column. The scaler is fitted on
        the training partition only and then applied to both partitions; the
        ``id_col`` column is excluded from scaling and re-attached unchanged.
        Writes ``robust_scaler.joblib``, ``train_data.csv`` and
        ``test_data.csv`` into ``root_dir``.

        Args:
            test_size: Fraction of rows placed in the test partition.
            random_state: Seed passed to ``train_test_split``.
            id_col: Identifier column that is carried through unscaled.

        Raises:
            ValueError: If no input data has been loaded yet.
            Exception: Any error from splitting, scaling or writing, re-raised
                after an error is logged.
        """
        try:
            if self.input_df is None:
                raise ValueError("input data is not loaded; call prepare_and_load_files() first")

            X = self.input_df.drop(columns=[self.predictor_col])
            y = self.input_df[self.predictor_col]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state, stratify=y, shuffle=True
            )
            logger.info("train test split completed")

            robust_scaler = RobustScaler().fit(X_train.drop(columns=id_col))

            train_df = self._scale_partition(robust_scaler, X_train, y_train, id_col)
            test_df = self._scale_partition(robust_scaler, X_test, y_test, id_col)

            logger.info("train and test data scaling through robust scaler completed")

            root_dir = Path(self.config.root_dir)
            joblib.dump(robust_scaler, root_dir / "robust_scaler.joblib")
            train_df.to_csv(root_dir / "train_data.csv", index=False)
            test_df.to_csv(root_dir / "test_data.csv", index=False)

            # Interpolate the location into the message; it was previously
            # passed as a separate positional argument with no placeholder
            # to receive it.
            logger.info(f"robust_scaler.joblib, train_data.csv, test_data.csv are saved to {root_dir}")

        except Exception:
            logger.error("train test split and transformation failed")
            raise

In [53]:
# Assemble the transformation step from the project configuration, then run
# its two stages in order: stage/load the input file, then split and scale.
config_manager = ConfigurationManager()
transformation_config = config_manager.get_data_transformation_config()
target_column_name = config_manager.get_data_schema().TARGET_COLUMN.name

dt_step = Data_Transformation(
    data_transformation_config=transformation_config,
    predictor_col=target_column_name,
)
dt_step.prepare_and_load_files()
dt_step.tr_test_split_and_transform()

[32m2024-05-28 22:24:59.831[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: config\config.yaml loaded successfully[0m
[32m2024-05-28 22:24:59.838[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: params.yaml loaded successfully[0m
[32m2024-05-28 22:24:59.843[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: schema.yaml loaded successfully[0m
[32m2024-05-28 22:24:59.849[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mcreate_directories[0m:[36m51[0m - [1mcreated directory at: artifacts[0m
[32m2024-05-28 22:24:59.851[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mcreate_directories[0m:[36m51[0m - [1mcreated directory at: artifacts/data_transformation[0m
[32m2024-05-28 22:25:00.204[0m | [1mINF