In [1]:
import os
os.chdir('../')
%pwd

'D:\\projects\\Project-WineQualityDataset'

In [2]:
import pandas as pd
# Exploratory peek: load the ingested CSV (path is relative to the current
# working directory set in the first cell) and display its column labels.
df = pd.read_csv('artifacts/data_ingestion/WineQT.csv')
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'Id'],
      dtype='object')

In [37]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the paths used by the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory the transformation stage writes its output files into.
    root_dir: Path
    # Location of the CSV file the stage loads with pandas.
    data_path: Path
    # Text file holding the per-column validation results, read line by line.
    STATUS_FILE: Path

In [38]:
from WineQuality.constants import * 
from WineQuality.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the same order as before: config, params, schema.
        yaml_sources = (config_filepath, params_filepath, schema_filepath)
        self.config, self.params, self.schema = (
            read_yaml(source) for source in yaml_sources
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Ensures the stage's ``root_dir`` exists (via ``create_directories``)
        before returning.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir = stage.root_dir,
            data_path = stage.data_path,
            STATUS_FILE = stage.STATUS_FILE,
        )
        

In [78]:
import os
from WineQuality import logger
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class DataTransformation:
    """Prepare the ingested dataset for training.

    The stage loads the CSV named by the config, removes columns that were
    flagged as failing validation, and writes a train/test split to the
    stage's output directory.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and load the dataset.

        Args:
            config: Object exposing ``data_path``, ``STATUS_FILE`` and
                ``root_dir``. (Previously the class itself was used as a
                default value where a type annotation was intended.)
        """
        self.config = config
        self.df = pd.read_csv(self.config.data_path)

    def drop_non_validated_columns(self):
        """Drop every column the status file marks as having failed validation.

        Each line of ``config.STATUS_FILE`` is split on single spaces: the
        token at index 2 is the validation result, and the tokens from index 5
        onward (re-joined with spaces, trailing whitespace stripped) form the
        column name. Lines with fewer than three tokens, such as blank lines,
        carry no result and are skipped instead of raising ``IndexError``.

        Raises:
            KeyError: If a failed column named in the file is not present in
                the loaded frame.
        """
        with open(self.config.STATUS_FILE, 'r') as status_file:
            for validation in status_file:
                tokens = validation.split(' ')
                if len(tokens) < 3 or tokens[2] != 'False':
                    continue
                column = ' '.join(tokens[5:]).rstrip()
                # Reassign rather than mutate in place so the frame is never
                # left half-modified and other references are unaffected.
                self.df = self.df.drop(columns=[column])
                logger.info(f"Dropped Column {column} as it did not pass validation")

    def train_test_split(self, target_column: str = "quality", test_size: float = 0.2, random_state: int = 22):
        """Split the frame into features/target train and test sets and save them.

        Features and target are selected by column name rather than by
        position, so the split stays correct regardless of which columns were
        dropped beforehand. With the default arguments and the usual
        12-column frame (11 features followed by ``quality``) the result is
        identical to the former positional selection.

        The four parts are written, without the index, to ``config.root_dir``
        as ``X_train``, ``X_test``, ``Y_train`` and ``Y_test``.

        Args:
            target_column: Name of the label column.
            test_size: Fraction of rows placed in the test split.
            random_state: Seed passed to the splitter for reproducibility.

        Raises:
            KeyError: If ``target_column`` is not a column of the frame.
        """
        Y = self.df[target_column]
        X = self.df.drop(columns=[target_column])

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state
        )

        outputs = {
            "X_train": X_train,
            "X_test": X_test,
            "Y_train": Y_train,
            "Y_test": Y_test,
        }
        for file_name, part in outputs.items():
            part.to_csv(os.path.join(self.config.root_dir, file_name), index = False)

        logger.info(f"Data splitted into X_train, X_test, Y_train, Y_test")
        for file_name, part in outputs.items():
            logger.info(f"{file_name}.shape: {part.shape}")
        
        

In [79]:
# Run the data-transformation stage end to end: build the stage config, load
# the data, drop columns that failed validation, then write the train/test split.
# The former `try: ... except Exception as e: raise e` wrapper was removed: it
# caught every error only to re-raise it unchanged, so failures now simply
# propagate with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
data_transformation.drop_non_validated_columns()
data_transformation.train_test_split()

[2023-09-30 11:06:40,003: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-30 11:06:40,005: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-30 11:06:40,008: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-09-30 11:06:40,009: INFO: common: Directory Created: artifacts]
[2023-09-30 11:06:40,010: INFO: common: Directory Created: artifacts/data_transformation]
[2023-09-30 11:06:40,015: INFO: 387226558: Dropped Column Id as it did not pass validation]
[2023-09-30 11:06:40,037: INFO: 387226558: Data splitted into X_train, X_test, Y_train, Y_test]
[2023-09-30 11:06:40,039: INFO: 387226558: X_train.shape: (914, 11)]
[2023-09-30 11:06:40,060: INFO: 387226558: X_test.shape: (229, 11)]
[2023-09-30 11:06:40,064: INFO: 387226558: Y_train.shape: (914,)]
[2023-09-30 11:06:40,065: INFO: 387226558: Y_test.shape: (229,)]
