In [1]:
import os
%pwd

'd:\\Machine_Learning\\Titanic_Pipeline_Project\\research'

In [2]:
os.chdir("../")

In [3]:
%pwd

'd:\\Machine_Learning\\Titanic_Pipeline_Project'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [5]:
from titanic.constants import *
from titanic.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    def __init__(
            self,
            config_file_path = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH
            ):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir= config.root_dir,
            data_path= config.data_path,
        )
        return data_transformation_config

In [11]:
import pandas as pd
import numpy as np
from titanic.logging import logger
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def load_dataframe(self):
        df = pd.read_csv(os.path.join(self.config.data_path, "Titanic-Dataset.csv"))
        logger.info(f"Loaded {df.shape[0]} rows of data")
        return df

    def transform_data(self):
        df = self.load_dataframe()

        df = df.drop(columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"], axis=1)
        
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
        
        df = df.dropna(subset=["Embarked"])

        df['Age'] = df['Age'].fillna(value=int(df['Age'].mean()))

        df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2})

        logger.info(f"Transformed {df.shape[0]} rows of data")

        return df
    
    def save_transformed_data(self):
        df = self.transform_data()
        df.to_csv(os.path.join(self.config.root_dir, "transformed_data.csv"), index=False)
        logger.info(f"Saved transformed data to {os.path.join(self.config.data_path, 'transformed_data.csv')}")

In [14]:
try:
    config = ConfigurationManager()
    data_tranformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_tranformation_config)
    data_transformation.save_transformed_data()
    
except Exception as e:
    raise e

[2023-12-29 15:33:15,225: INFO: common: yaml file config\config.yaml loaded successfully]
[2023-12-29 15:33:15,231: INFO: common: yaml file params.yaml loaded successfully]
[2023-12-29 15:33:15,234: INFO: common: created directory at: artifacts]
[2023-12-29 15:33:15,239: INFO: common: created directory at: artifacts/data_transformation]
[2023-12-29 15:33:15,257: INFO: 809541087: Loaded 891 rows of data]
[2023-12-29 15:33:15,265: INFO: 809541087: Transformed 889 rows of data]
[2023-12-29 15:33:15,275: INFO: 809541087: Saved transformed data to artifacts/data_ingestion\transformed_data.csv]
