In [17]:
import os

In [18]:
# Show the kernel's current working directory.
%pwd
# Disabled directory change. The recorded output of this cell appears to
# already be the project root -- TODO confirm before re-enabling.
# os.chdir('../')
# Show the working directory again; as the last expression, this is the value
# rendered as the cell's output.
%pwd

'/workspaces/mlproject_wine_quality'

In [19]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataTransformationConfig:
    """Filesystem locations used by the data-transformation stage.

    Instances are produced by
    ``ConfigurationManager.get_data_transformation_config`` from the
    ``data_transformation`` section of the loaded configuration.
    """

    # Directory handed to make_directory() and used as the parent of the
    # train.csv / test.csv outputs.
    root_dir: Path
    # CSV file read with pandas as the stage's input dataset.
    data_path: Path
    # Path associated with the scaler artifact. The code that would write a
    # scaler there is currently commented out in DataTransformation.
    scaler_path: Path


In [20]:
from wine_quality_predictor.constants import *
from wine_quality_predictor.utils.common import read_yaml, make_directory

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config, params and schema files are read (in that
    order) and the ``artifacts_root`` entry of the config is passed to
    ``make_directory``.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Keep the raw locations available on the instance.
        self.config_filepath, self.params_filepath, self.schema_filepath = (
            config_filepath,
            params_filepath,
            schema_filepath,
        )

        # Each location is wrapped in Path before being handed to read_yaml.
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))
        self.schema = read_yaml(Path(schema_filepath))

        artifacts_root = Path(self.config.artifacts_root)
        make_directory(artifacts_root)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Returns:
            A ``DataTransformationConfig`` whose fields are the
            ``root_dir``, ``data_path`` and ``scaler_path`` entries of the
            ``data_transformation`` config section, each converted to ``Path``.
        """
        section = self.config.data_transformation

        root_dir = Path(section.root_dir)
        data_path = Path(section.data_path)
        scaler_path = Path(section.scaler_path)

        return DataTransformationConfig(
            root_dir=root_dir,
            data_path=data_path,
            scaler_path=scaler_path,
        )


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from wine_quality_predictor.entity.config_entity import DataTransformationConfig
from wine_quality_predictor.utils.common import make_directory, save_bin
from wine_quality_predictor import logger

class DataTransformation:
    """Split the input dataset into train and test CSV files.

    The input CSV is read from ``config.data_path`` and the two splits are
    written as ``train.csv`` and ``test.csv`` under ``config.root_dir``.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths used by this stage (input CSV, output directory,
                scaler location).
        """
        self.config = config

    def transform_and_split(self):
        """Read the dataset, split it 80/20 and persist both splits.

        The split uses ``test_size=0.2`` and ``random_state=42`` so repeated
        runs on the same input produce the same partition. Both files are
        written without the DataFrame index.

        Feature scaling is currently disabled: no ``StandardScaler`` is
        fitted and nothing is written to ``config.scaler_path``. To enable
        it, fit the scaler on the training features only, apply it to both
        splits, and persist it with ``save_bin(config.scaler_path, scaler)``.

        Returns:
            None. The results are the two CSV files on disk.
        """
        logger.info("Reading dataset for transformation and splitting...")
        df = pd.read_csv(self.config.data_path)

        logger.info("Splitting data into train and test...")
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

        logger.info("Creating output directories and saving files...")
        make_directory(self.config.root_dir)

        train_path = self.config.root_dir / "train.csv"
        test_path = self.config.root_dir / "test.csv"

        train_df.to_csv(train_path, index=False)
        # test.csv must come from the held-out split; writing the training
        # frame here would make any downstream evaluation score the model on
        # the rows it was trained on.
        test_df.to_csv(test_path, index=False)

        logger.info(f"Train and test data saved to {self.config.root_dir}")
        # Report the real state of the scaler artifact instead of claiming it
        # was saved when no scaler is fitted or written.
        logger.info(
            f"Scaling is disabled; no scaler written to {self.config.scaler_path}"
        )


In [22]:
# from src.<your_project>.config.configuration import ConfigurationManager
# from src.<your_project>.components.data_transformation import DataTransformation
from wine_quality_predictor import logger

# Human-readable stage label interpolated into the start, completion and
# error log lines emitted by main().
STAGE_NAME = "Data Transformation"

def main():
    """Run the data-transformation stage from configuration to saved splits.

    Logs a start and a completion banner around the work. Any exception is
    logged with its traceback via ``logger.exception`` and then re-raised so
    the caller still sees the failure.
    """
    try:
        logger.info(f">>>>>> Stage {STAGE_NAME} started <<<<<<")

        stage_config = ConfigurationManager().get_data_transformation_config()
        DataTransformation(stage_config).transform_and_split()

        logger.info(f">>>>>> Stage {STAGE_NAME} completed <<<<<<\n")
    except Exception as e:
        logger.exception(f"Error in stage {STAGE_NAME}: {e}")
        raise e


In [23]:
main()

[2025-04-13 07:57:27,437] INFO - 2021950657 - >>>>>> Stage Data Transformation started <<<<<<
[2025-04-13 07:57:27,443] INFO - common - Loaded YAML file from: config/config.yaml
[2025-04-13 07:57:27,445] INFO - common - Loaded YAML file from: params.yaml
[2025-04-13 07:57:27,448] INFO - common - Loaded YAML file from: schema.yaml
[2025-04-13 07:57:27,450] INFO - common - Created directory: artifacts
[2025-04-13 07:57:27,452] INFO - 2047129445 - Reading dataset for transformation and splitting...
[2025-04-13 07:57:27,456] INFO - 2047129445 - Splitting data into train and test...
[2025-04-13 07:57:27,458] INFO - 2047129445 - Creating output directories and saving files...
[2025-04-13 07:57:27,459] INFO - common - Created directory: artifacts/data_transformation
[2025-04-13 07:57:27,485] INFO - 2047129445 - Train and test data saved to artifacts/data_transformation
[2025-04-13 07:57:27,486] INFO - 2047129445 - Scaler saved at artifacts/data_transformation/scaler.pkl
[2025-04-13 07:57:27,4

In [24]:
# Reload the split files from the data-transformation output directory and
# separate the "quality" target column from the remaining feature columns.
train_data_path = "artifacts/data_transformation/train.csv"
test_data_path = "artifacts/data_transformation/test.csv"

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

X_train, y_train = train_df.drop(columns="quality"), train_df["quality"]
X_test, y_test = test_df.drop(columns="quality"), test_df["quality"]

# Print the number of missing target values in the training split, then
# display the training target itself as the cell's result.
print(y_train.isna().sum())
y_train

0


0      5
1      6
2      5
3      6
4      6
      ..
912    5
913    6
914    6
915    6
916    5
Name: quality, Length: 917, dtype: int64