In [1]:
import os
os.chdir("../")
%pwd

'd:\\Machine_Learning\\Fetal-Health-Classification'

In [2]:
from dataclasses import dataclass 
from pathlib import Path
@dataclass(frozen=True)
class DataTransformationConfig:
    """Paths used by the data-transformation stage (frozen: fields cannot be reassigned)."""
    # Working directory of the stage; ConfigurationManager passes it to create_directories.
    root_dir: Path
    # Destination file for the fitted preprocessor object.
    preprocessor_path: Path
    # Directory that contains the raw "fetal_health.csv" file.
    data_path: Path
    # Destination CSV for the transformed training set.
    train_path: Path
    # Destination CSV for the transformed test set.
    test_path: Path

In [3]:
from FetalHealthC.constants import *
from FetalHealthC.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage configuration objects."""

    def __init__(
            self,
            config_file= CONFIG_FILE_PATH,
            params_file= PARAMS_FILE_PATH
            ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_file: Location of the main configuration YAML.
            params_file: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file)
        self.params = read_yaml(params_file)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares every field as Path, but annotations are not
        # enforced at runtime; convert here so the declared type is actually true.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            preprocessor_path=Path(section.preprocessor_path),
            data_path=Path(section.data_path),
            train_path=Path(section.train_path),
            test_path=Path(section.test_path),
        )

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from FetalHealthC.logging import logger

from FetalHealthC.utils.common import save_object

class DataTransformation:
    """Split the raw dataset, standardise its features and persist the results.

    Reads ``fetal_health.csv`` from ``config.data_path``, performs a seeded
    train/test split, fits a ``StandardScaler`` on the training features only,
    saves the fitted scaler to ``config.preprocessor_path`` and writes the
    scaled train/test sets (feature columns followed by the target column) to
    ``config.train_path`` and ``config.test_path``.
    """

    # Values the pipeline depends on, named once instead of repeated inline.
    DATA_FILE = "fetal_health.csv"
    TARGET_COLUMN = "fetal_health"
    TEST_SIZE = 0.2
    RANDOM_STATE = 42

    def __init__(
            self,
            # Quoted so the annotation is not evaluated when the class is defined.
            data_transformation_config: "DataTransformationConfig"
            ):
        """Store the configuration that supplies every input and output path."""
        self.config = data_transformation_config

    def save_datasets(self, train_dataset: pd.DataFrame, test_dataset: pd.DataFrame):
        """Write the train and test frames as CSV files without the index.

        Args:
            train_dataset: Frame written to ``config.train_path``.
            test_dataset: Frame written to ``config.test_path``.
        """
        train_path = Path(self.config.train_path)
        test_path = Path(self.config.test_path)

        # Create missing parent folders so the write does not depend on another
        # component having prepared them.
        for destination in (train_path, test_path):
            destination.parent.mkdir(parents=True, exist_ok=True)

        train_dataset.to_csv(train_path, index=False)
        test_dataset.to_csv(test_path, index=False)
        logger.info(f"Datasets saved at {self.config.train_path} and {self.config.test_path}")

    def initiate_data_transformation(self):
        """Run the whole stage: load, split, scale, save scaler, save datasets.

        Raises:
            KeyError: If the target column is missing from the CSV.
        """
        # Fetching the dataset (pathlib keeps this class independent of an
        # `os` import made in a different cell)
        df = pd.read_csv(Path(self.config.data_path) / self.DATA_FILE)
        logger.info("Dataset has been fetched")

        # Splitting the dataset; `columns=` already selects the axis
        x = df.drop(columns=[self.TARGET_COLUMN])
        y = df[self.TARGET_COLUMN]
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, random_state=self.RANDOM_STATE, test_size=self.TEST_SIZE
        )
        logger.info("Dataset splitted")

        # Preprocessing: fit on the training split only, then reuse the fitted
        # statistics for the test split
        std = StandardScaler()
        x_train = std.fit_transform(x_train)
        x_test = std.transform(x_test)
        logger.info("Dataset preprocessed")

        # Saving the preprocessor file
        save_object(Path(self.config.preprocessor_path), std)
        logger.info(f"Preprocessor saved at {self.config.preprocessor_path}")

        # Appending the target as the last column of each array
        train_arr = np.c_[x_train, np.array(y_train)]
        test_arr = np.c_[x_test, np.array(y_test)]
        logger.info("Numpy arrays combined")

        # The arrays are laid out as features followed by the target, so the
        # labels must follow that order. Reusing df.columns would mislabel the
        # data whenever the target is not the last column of the CSV.
        output_columns = list(x.columns) + [self.TARGET_COLUMN]
        train_df = pd.DataFrame(train_arr, columns=output_columns)
        test_df = pd.DataFrame(test_arr, columns=output_columns)
        logger.info("Pandas Dataframe created for both train and test np arrays")

        # Saving the dataframes to their respective files
        self.save_datasets(train_df, test_df)

    # Original (misspelled) name kept so existing callers continue to work.
    initate_data_transformation = initiate_data_transformation



In [6]:
# Run the stage end to end. No try/except wrapper: a handler that only
# re-raises adds nothing, so failures propagate with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
data_transformation.initate_data_transformation()

[2024-01-13 14:17:10,038: INFO: common: yaml file config\config.yaml loaded successfully]
[2024-01-13 14:17:10,049: INFO: common: yaml file params.yaml loaded successfully]
[2024-01-13 14:17:10,051: INFO: common: created directory at: artifacts]
[2024-01-13 14:17:10,053: INFO: common: created directory at: artifacts/data_transformation]
[2024-01-13 14:17:10,065: INFO: 3764565549: Dataset has been fetched]
[2024-01-13 14:17:10,076: INFO: 3764565549: Dataset splitted]
[2024-01-13 14:17:10,082: INFO: 3764565549: Dataset preprocessed]
[2024-01-13 14:17:10,090: INFO: 3764565549: Preprocessor saved at artifacts/data_transformation/preprocessor.pkl]
[2024-01-13 14:17:10,090: INFO: 3764565549: Numpy arrays combined]
[2024-01-13 14:17:10,095: INFO: 3764565549: Pandas Dataframe created for both train and test np arrays]
[2024-01-13 14:17:10,165: INFO: 3764565549: Datasets saved at artifacts/data_transformation/train.csv and artifacts/data_transformation/test.csv]


In [7]:
import numpy as np
import pandas as pd
# Toy 7x2 frame (both columns hold 1..7) used to experiment with splitting.
a = np.repeat(np.arange(1, 8), 2).reshape(-1, 2)
df = pd.DataFrame(data=a, columns=list("AB"))
df

Unnamed: 0,A,B
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7


In [8]:
# Display the toy frame again (same content as the previous cell's output).
df

Unnamed: 0,A,B
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7


In [9]:
from sklearn.model_selection import train_test_split

# Seeded split of the toy frame into train and test parts (test_size=0.2).
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Check what kind of object the split produced for the training part.
type(train)

pandas.core.frame.DataFrame

In [11]:
# Inspect the held-out rows.
test

Unnamed: 0,A,B
0,1,1
1,2,2


In [12]:
# `columns=` already identifies the axis, so no `axis` argument is needed.
# drop returns a new frame; `train` itself is left unchanged.
train.drop(columns=['A'])

Unnamed: 0,B
5,6
2,3
4,5
3,4
6,7


In [13]:
# Show train again: column A is still present, so the drop in the previous cell did not modify it.
train

Unnamed: 0,A,B
5,6,6
2,3,3
4,5,5
3,4,4
6,7,7
