In [2]:
import os

In [3]:
%pwd

'c:\\code\\ML\\e-commerce\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\code\\ML\\e-commerce'

In [6]:
#config.yaml


In [7]:
#entity

In [9]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class EDA:
  data_path : Path
  report : Path


In [10]:
#configuration

In [11]:
from e_commerce.utils.common import read_yaml , create_directories 
from e_commerce.constants import *
from e_commerce.entity.config_entity import DataIngestionconfig,PreProcessing,FeatureEngineeringconfig

class ConfigurationManager:
    def __init__(self,config_file_path=CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self)->DataIngestionconfig:
        config = self.config.data_ingestion
        data_ingestion_config = DataIngestionconfig(root_dir=config.root_dir,
                                                    source_url=config.source_url,
                                                    local_data_file=config.local_data_file)
        return data_ingestion_config
    
    def get_preprocessing_config(self)->PreProcessing:
        config = self.config.pre_processing
        create_directories([config.cleaned_data_save_path])
        create_directories([config.data_report])
        pre_processing_config = PreProcessing(data_path=config.data_path,cleaned_data_save_path=config.cleaned_data_save_path,data_report=config.data_report
                                              )
        return pre_processing_config
    
    def get_feature_engg_config(self)->FeatureEngineeringconfig:
        config = self.config.feature_engg
        create_directories([config.output_path])
        feature_engg_config = FeatureEngineeringconfig(data_path=config.data_path,output_path=config.output_path)
        return feature_engg_config 
    

    def get_eda_config(self)->EDA:
        config = self.config.EDA
        create_directories([config.report])
        eda_config = EDA(data_path=config.data_path,report=config.report)

        return eda_config

In [12]:
#component

In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



class EDAComponent:
    def __init__(self,config : EDA):
        self.config = config

    def load_data(self):
        if not os.path.exists(self.config.data_path):
            raise FileNotFoundError(f"Input file not found: {self.config.data_path}")
        return pd.read_csv(self.config.data_path)

    def get_EDA(self):
        rmf_df = self.load_data()

        rmf = rmf_df[['recency', 'frequency', 'monetary']]

        # Histograms
        plt.figure(figsize=(12,10))
        rmf.hist(bins=10, figsize=(12, 6))
        plt.suptitle('Distributions of Recency, Frequency, and Monetary', fontsize=16)
        plt.savefig(os.path.join(self.config.report,"histograms.png"))
        plt.clf()

        # Boxplots to spot outliers
        plt.figure(figsize=(12, 5))
        for i, col in enumerate(rmf.columns):
            plt.subplot(1, 3, i+1)
            sns.boxplot(y=rmf[col])
            plt.title(f'Boxplot of {col}')
        plt.tight_layout()
        plt.savefig(os.path.join(self.config.report,"boxplots.png"))
        plt.clf()


        # correlation
        plt.figure(figsize=(6,4))
        sns.heatmap(rmf.corr(),annot=True,cmap='coolwarm')
        plt.title('RFM Feature Correlation')
        plt.savefig(os.path.join(self.config.report,"correlation.png"))
        plt.clf()


        # pairplots
        
        sns.pairplot(rmf)
        plt.suptitle('Pairplot of RFM Features', y=1.02)
        plt.savefig(os.path.join(self.config.report,"pairplot.png"))
        plt.clf()


In [14]:
!pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting matplotlib!=3.6.1,>=3.4
  Downloading matplotlib-3.10.3-cp310-cp310-win_amd64.whl (8.1 MB)
     ---------------------------------------- 8.1/8.1 MB 37.3 kB/s eta 0:00:00
Collecting fonttools>=4.22.0
  Downloading fonttools-4.58.1-cp310-cp310-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 31.5 kB/s eta 0:00:00
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting pyparsing>=2.3.1
  Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)
Collecting pillow>=8
  Using cached pillow-11.2.1-cp310-cp310-win_amd64.whl (2.7 MB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.8-cp310-cp310-win_amd64.whl (71 kB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.2-cp310-cp310-win_amd64.whl (221 kB)
Installing collected packages: pyparsing, pillow, kiwisolver, fonttools, cycler, contourpy, matplotlib, seaborn
Successfully install


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
!pip install matplotlib




[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
#pipeline

In [18]:
try :
    config = ConfigurationManager()
    eda_config = config.get_eda_config()
    eda_comp = EDAComponent(config=eda_config)
    eda_comp.get_EDA()

except Exception as e:
    raise e

[2025-05-28 22:32:13,972: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-28 22:32:13,988: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-28 22:32:13,988: INFO: common: created directory at: artifacts]
[2025-05-28 22:32:14,003: INFO: common: created directory at: artifacts/EDA]


<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x500 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 750x750 with 0 Axes>