In [1]:
import os

In [2]:
%pwd

'c:\\code\\ML\\e-commerce\\research'

In [3]:
# Step up from the notebook folder to the project root so the relative
# config/artifact paths used below resolve. Guarded on the folder name so that
# re-running this cell does not climb one directory further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\code\\ML\\e-commerce'

In [7]:
import pandas as pd
from pathlib import Path

In [6]:
#config.yaml


In [17]:
# entity 
from dataclasses import dataclass
@dataclass
class PreProcessing:
    """Filesystem locations consumed by the pre-processing stage."""

    # Input dataset file that the stage reads.
    data_path: Path
    # Directory that receives the pre-cleaning report.
    data_report: Path
    # Directory that receives the cleaned dataset.
    cleaned_data_save_path: Path

In [11]:
# configuration


In [19]:
from e_commerce.utils.common import read_yaml , create_directories 
from e_commerce.constants import *
from e_commerce.entity.config_entity import DataIngestionconfig


class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config entities.

    Constructing the manager reads both YAML files and creates the artifacts
    root directory named in the config file.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_file_path: Location of the main config YAML.
            params_file_path: Location of the params YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionconfig:
        """Build the data-ingestion entity from the ``data_ingestion`` config section."""
        section = self.config.data_ingestion
        return DataIngestionconfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
        )

    def get_preprocessing_config(self) -> PreProcessing:
        """Build the pre-processing entity and create its output directories.

        Returns:
            A ``PreProcessing`` whose fields are ``Path`` objects, matching the
            entity's annotations.
        """
        section = self.config.pre_processing
        # Same creation order as before: cleaned-data directory first, then reports.
        create_directories([section.cleaned_data_save_path, section.data_report])
        # The entity declares Path fields, and dataclasses do not coerce types,
        # so convert the raw YAML values explicitly.
        return PreProcessing(
            data_path=Path(section.data_path),
            data_report=Path(section.data_report),
            cleaned_data_save_path=Path(section.cleaned_data_save_path),
        )
    

    


In [16]:
#components

In [20]:
import pandas as pd
from e_commerce import logger
class PreProcessingComponent:
    """Clean the raw dataset described by a ``PreProcessing`` config.

    Reads the Excel file at ``config.data_path``, writes a per-column
    missing-value count under ``config.data_report``, and saves the cleaned
    frame as CSV under ``config.cleaned_data_save_path``.
    """

    def __init__(self, config: PreProcessing):
        self.config = config

    def load_data(self):
        """Read the input Excel file into a DataFrame.

        Raises:
            FileNotFoundError: If ``config.data_path`` does not exist.
        """
        if not os.path.exists(self.config.data_path):
            raise FileNotFoundError(f"Input file not found: {self.config.data_path}")
        return pd.read_excel(self.config.data_path)

    def data_cleaning(self):
        """Write the missing-value report, clean the data and save it as CSV.

        Cleaning drops every row that has a missing value, removes the
        ``Description`` and ``Country`` columns, casts ``CustomerID`` to
        ``str`` and resets the index.
        """
        raw_df = self.load_data()

        # Create the output directories here so the component also works when
        # it is given a hand-built config.
        os.makedirs(self.config.data_report, exist_ok=True)
        os.makedirs(self.config.cleaned_data_save_path, exist_ok=True)

        # Snapshot of per-column NaN counts before any rows are removed.
        report_path = os.path.join(self.config.data_report, 'pre_info.txt')
        with open(report_path, 'w') as f:
            f.write(str(raw_df.isna().sum()))
        logger.info(f"check {self.config.data_report} for data reports")

        # dropna() already removes rows with a missing CustomerID, so the former
        # second dropna(subset=['CustomerID']) was a no-op and is omitted.
        cleaned_df = (
            raw_df
            .dropna()
            .drop(['Description', 'Country'], axis=1)
            # Convert CustomerID to string (for grouping)
            .astype({'CustomerID': str})
            .reset_index(drop=True)
        )
        logger.info('data cleaning completed')

        cleaned_file_path = os.path.join(self.config.cleaned_data_save_path, "pre_processed.csv")
        cleaned_df.to_csv(cleaned_file_path, index=False)
        logger.info(f"Cleaned data saved to: {cleaned_file_path}")


In [21]:
# pipeline: build the pre-processing config and run the cleaning stage
try:
    config = ConfigurationManager()
    data_preprocessing_config = config.get_preprocessing_config()
    pre_process_component = PreProcessingComponent(config=data_preprocessing_config)
    pre_process_component.data_cleaning()
except Exception:
    # Record the failure in the project log, then re-raise. A bare `raise`
    # keeps the original traceback without an extra re-raise frame.
    logger.exception("pre-processing pipeline failed")
    raise



[2025-05-27 21:09:42,067: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-27 21:09:42,187: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-27 21:09:42,202: INFO: common: created directory at: artifacts]
[2025-05-27 21:09:42,213: INFO: common: created directory at: artifacts/cleaned_data]
[2025-05-27 21:09:42,220: INFO: common: created directory at: artifacts/data_report]
[2025-05-27 21:11:08,759: INFO: 2842944345: check artifacts/data_report for data reports]
[2025-05-27 21:11:09,742: INFO: 2842944345: data cleaning completed]
[2025-05-27 21:11:13,808: INFO: 2842944345: Cleaned data saved to: artifacts/cleaned_data\pre_processed.csv]
