In [2]:
import os 


In [3]:
%pwd

'c:\\code\\ML\\e-commerce\\research'

In [4]:
# Step up one directory so that relative paths used by later cells resolve against
# the parent folder (the surrounding %pwd outputs show ...\research -> ...\e-commerce).
# NOTE(review): the path is relative, so re-running this cell moves up another level;
# run it exactly once per kernel session.
os.chdir("../")

In [5]:
%pwd

'c:\\code\\ML\\e-commerce'

In [6]:
#config.yaml

In [7]:
#config entity

In [12]:
from e_commerce.utils.common import read_yaml , create_directories 
from e_commerce.constants import *
from e_commerce.entity.config_entity import DataIngestionconfig,PreProcessing,FeatureEngineeringconfig,EDA,outlier,cluster

class ConfigurationManager:
    """Turn the project's YAML configuration into per-stage config entities.

    The constructor loads the config and params files through ``read_yaml`` and
    passes the configured artifacts root to ``create_directories``. Every
    accessor below picks one section of the loaded config, hands the section's
    output locations to ``create_directories`` where applicable, and wraps the
    values in the matching entity class.
    """

    def __init__(self,config_file_path=CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        """Load both YAML files and prepare the artifacts root.

        Args:
            config_file_path: location of the main config file.
            params_file_path: location of the params file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionconfig:
        """Return the settings stored under the ``data_ingestion`` section."""
        section = self.config.data_ingestion
        return DataIngestionconfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
        )

    def get_preprocessing_config(self) -> PreProcessing:
        """Return the ``pre_processing`` settings after preparing its output paths."""
        section = self.config.pre_processing
        # One call per path, in this order, so a missing key fails at the same point.
        create_directories([section.cleaned_data_save_path])
        create_directories([section.data_report])
        return PreProcessing(
            data_path=section.data_path,
            cleaned_data_save_path=section.cleaned_data_save_path,
            data_report=section.data_report,
        )

    def get_feature_engg_config(self) -> FeatureEngineeringconfig:
        """Return the ``feature_engg`` settings after preparing its output path."""
        section = self.config.feature_engg
        create_directories([section.output_path])
        return FeatureEngineeringconfig(
            data_path=section.data_path,
            output_path=section.output_path,
        )

    def get_eda_config(self) -> EDA:
        """Return the ``EDA`` settings after preparing its report path."""
        section = self.config.EDA
        create_directories([section.report])
        return EDA(data_path=section.data_path, report=section.report)

    def outlier_config(self) -> outlier:
        """Return the ``outlier`` settings after preparing its output and report paths."""
        section = self.config.outlier
        create_directories([section.output_path])
        create_directories([section.report])
        return outlier(
            data_path=section.data_path,
            output_path=section.output_path,
            report=section.report,
        )

    def cluster_config(self) -> cluster:
        """Return the ``cluster`` settings, combining config paths with the params seed."""
        section = self.config.cluster
        hyper_params = self.params.cluster
        create_directories([section.models])
        create_directories([section.report])
        return cluster(
            data_path=section.data_path,
            models=section.models,
            report=section.report,
            random_state=hyper_params.random_state,
        )

In [13]:
#component

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.cluster import KMeans
from kneed import KneeLocator
class ClusterComponent:
    """Scale the RFM features, run a KMeans elbow search and persist the results.

    Attributes read from ``config``:
        data_path: CSV file containing ``recency``, ``frequency`` and ``monetary`` columns.
        models: directory that receives the pickled scaler and KneeLocator.
        report: directory that receives the SSE table, the elbow summary and the plot.
        random_state: seed forwarded to every ``KMeans`` fit.
    """

    def __init__(self,config : cluster):
        self.config = config

    def load_data(self):
        """Read the input CSV into a DataFrame.

        Returns:
            pandas.DataFrame: contents of ``config.data_path``.

        Raises:
            FileNotFoundError: if ``config.data_path`` does not exist.
        """
        if not os.path.exists(self.config.data_path):
            raise FileNotFoundError(f"Input file not found: {self.config.data_path}")
        return pd.read_csv(self.config.data_path)

    def get_cluster(self):
        """Run the elbow analysis and write every artefact to disk.

        Steps:
            1. Min-max scale ``recency``/``frequency``/``monetary`` and pickle the scaler.
            2. Fit KMeans for k = 1..9 and record the inertia (SSE) of each fit.
            3. Locate the elbow with ``KneeLocator`` and pickle the locator.
            4. Write ``sse_info.txt``, ``knee_elbow.txt`` and ``elbow_graph.png``
               into the report directory.

        Raises:
            FileNotFoundError: propagated from ``load_data``.
            KeyError: if one of the three RFM columns is missing from the input.
        """
        rfm_columns = ['recency', 'frequency', 'monetary']
        df = self.load_data()
        rmf = df[rfm_columns]

        # scaling
        scaler = MinMaxScaler()
        rmf_scaled = pd.DataFrame(data=scaler.fit_transform(rmf), columns=rfm_columns)
        joblib.dump(scaler,os.path.join(self.config.models,'scaler.pkl'))

        # elbow method
        # NOTE(review): the scaler is fitted on all three RFM columns, but the
        # clustering below only uses recency and monetary — confirm this is intended.
        sse = []
        k_range = range(1,10)
        for k in k_range:
            km = KMeans(n_clusters=k,random_state=self.config.random_state)
            km.fit(rmf_scaled[['recency','monetary']])
            sse.append(km.inertia_)
        with open(os.path.join(self.config.report,"sse_info.txt"),'w') as f:
            # Pair each SSE with the k it was computed for instead of re-deriving
            # the k values from a separate counter.
            for k, val in zip(k_range, sse):
                f.write(f"k={k}, SSE={val}\n")

        # knee locator
        kl = KneeLocator(
            x=k_range,
            y=sse,
            curve='convex',
            direction='decreasing'
        )

        with open(os.path.join(self.config.report,'knee_elbow.txt'),'w') as f:
            f.write(f'optimal no of clusters (elbow) : {kl.elbow}')

        print(f"Optimal number of clusters (elbow): {kl.elbow}")
        # The config entity exposes `models` (not `cluster`) as the directory for
        # pickled objects; the scaler above is stored in the same place.
        joblib.dump(kl,os.path.join(self.config.models,'kneelocator.pkl'))

        self._save_elbow_plot(k_range, sse, kl.elbow)

    def _save_elbow_plot(self, k_range, sse, elbow):
        """Save the SSE-vs-k curve, marking the elbow when one was detected."""
        fig, ax = plt.subplots()
        try:
            ax.plot(k_range, sse, marker='o')
            # KneeLocator reports None when it cannot find an elbow; skip the
            # marker (and the then-empty legend) instead of failing in axvline.
            if elbow is not None:
                ax.axvline(x=elbow, color='red', linestyle='--', label=f'Elbow at k={elbow}')
                ax.legend()
            ax.set_xlabel('Number of Clusters')
            ax.set_ylabel('SSE')
            ax.set_title('Elbow Method with KneeLocator')
            ax.grid(True)
            fig.savefig(os.path.join(self.config.report,"elbow_graph.png"))
        finally:
            # Close the figure so no empty canvas is left behind in the notebook.
            plt.close(fig)

       


                            



        


        

In [16]:
#pipeline
# Build the cluster-stage config, then run the elbow analysis end to end.
try:
    config = ConfigurationManager()
    cluster_config = config.cluster_config()
    cluster_comp = ClusterComponent(config=cluster_config)
    cluster_comp.get_cluster()
except Exception:
    # Nothing is handled here; the failure is passed on unchanged.
    raise

[2025-06-01 14:40:27,496: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-01 14:40:27,503: INFO: common: yaml file: params.yaml loaded successfully]
[2025-06-01 14:40:27,503: INFO: common: created directory at: artifacts]
[2025-06-01 14:40:27,513: INFO: common: created directory at: artifacts/cluster]
[2025-06-01 14:40:27,517: INFO: common: created directory at: artifacts/cluster/report]


Optimal number of clusters (elbow): 3


<Figure size 640x480 with 0 Axes>