In [2]:
import pandas as pd

In [3]:
import os
%pwd

'c:\\code\\ML\\breast_cancer\\research'

In [4]:
os.chdir("../")

In [5]:
pwd

'c:\\code\\ML\\breast_cancer'

In [6]:
df_clean = pd.read_csv("artifacts\preprocessing\df_clean.csv",delimiter = ",")


In [26]:
df_clean.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
#entity

In [27]:
from dataclasses import dataclass
from pathlib import Path

In [28]:
@dataclass
class EDAconfig:
    report_path : Path
    df_clean_path : Path
    


In [10]:
# src / config / configuration

In [11]:
import sys
import os

sys.path.append(os.path.join(os.getcwd(), "src"))


In [29]:
from breast_cancer.constants import  *

from breast_cancer.utils.common import read_yaml , create_directories

In [33]:
from breast_cancer.entity.config_entity import DataIngestionConfig , PreProcessing

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH, 
                 params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir= config.root_dir,
            source_url= config.source_url,
            local_data_file= config.local_data_file,
            
        )

        return data_ingestion_config
    
    def preprocessing(self) -> PreProcessing:
        params = self.params.preprocessing
        config = self.config.preprocessing
        
        create_directories([config.df_pre_dir])
        pre_processing = PreProcessing(df_pre_dir=config.df_pre_dir,drop_columns=params.drop_columns,axis = params.axis)
        
        return pre_processing

    def EDA_configuration(self) -> EDAconfig:
        config = self.config.EDA_config
        create_directories([config.report_path])

        eda_config = EDAconfig(report_path=config.report_path,df_clean_path= config.df_clean_path)

        return eda_config


    

In [14]:
#components

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from breast_cancer import logger


class EDA :
    def __init__(self,config:EDAconfig):
        self.config = config
    
    def load_data(self):
        return pd.read_csv(self.config.df_clean_path)
    
    def perform_eda(self):
        try:
            df = self.load_data()
            logger.info("df_cleaned loaded as df")

            # os.makedirs(self.config.report_path,exist_ok=True)

            #basic_info
            with open(os.path.join(self.config.report_path,"basic_info.txt"),"w") as f:
                df.info(buf=f)
                f.write("\n\n")
                f.write(str(df.describe()))

            logger.info("basic info written")

            #class distribution or count plot
            sns.countplot(data=df,x = "diagnosis")
            plt.title("class distribution")
            plt.savefig(os.path.join(self.config.report_path,"countplot.png"))
            plt.clf()
            
            logger.info("class distribution graphs")


            #correlation
            plt.figure(figsize=(12,10))
            sns.heatmap(df.drop("diagnosis",axis=1).corr(),cmap="coolwarm",annot=False)
            plt.title("collinearity between features")
            plt.savefig(os.path.join(self.config.report_path,"correlation.png"))
            plt.clf()

            logger.info("correlation")

            #histograms / distribution plots
            df.drop("diagnosis",axis=1).hist(figsize=(16,12))
            plt.title("distribution plots")
            plt.savefig(os.path.join(self.config.report_path,"distribution_plots.png"))
            plt.clf()

            logger.info("histograms")

            #Boxplots
            plt.figure(figsize=(10,6))
            sns.boxplot(data=df,orient="h")
            plt.title("Box Plots")
            plt.savefig(os.path.join(self.config.report_path,"boxplots.png"))
            plt.clf() 

            logger.info("Boxplots")

            #pairplot
            sns.pairplot(df,hue="diagnosis")   
            # plt.title("Pair Plots")
            plt.savefig(os.path.join(self.config.report_path,"pairplot.png"))
            plt.clf()
            
            logger.info("pairplot")

            #violin plots
            for col in df.columns:
                if col=="diagnosis":
                    continue
                sns.violinplot(data=df,x="diagnosis",y=col)
                plt.title(f"violinplot-{col}")
                plt.savefig(os.path.join(self.config.report_path,f"violin_{col}.png"))
                plt.clf()

            logger.info("violin plots")

            #feature vs target bar plots

            df.groupby("diagnosis").mean().T.plot(kind="bar",figsize=(12,6))
            plt.title("feature vs target bar plot")
            plt.savefig(os.path.join(self.config.report_path,"feature_vs_target_barplot.png"))
            plt.clf()

            logger.info("done with EDA...loaded the entire analysis to report folder in artifacts")

        except Exception as e:
            raise e
            




    



In [None]:
#pipeline


In [38]:
from breast_cancer.components.data_ingestion import DataIngestion
from breast_cancer.components.pre_processing import DataPreProcess



try :
    #ingestion
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config= data_ingestion_config)
    csv_path =  Path(data_ingestion.download_file())
    df_initial = pd.read_csv(csv_path)

    # preprocess
    pre_processing_config = config.preprocessing()
    data_preprocess = DataPreProcess(config= pre_processing_config)
    df_cleaned = data_preprocess.get_preprocess_data(df = df_initial)
    

    #
    eda_config = config.EDA_configuration()
    eda = EDA(config=eda_config)
    eda.perform_eda()



except Exception as e :
    raise e 

[2025-05-01 17:37:07,518: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-01 17:37:07,524: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-01 17:37:07,528: INFO: common: created directory at: artifacts]
[2025-05-01 17:37:07,531: INFO: common: created directory at: artifacts/data_ingestion]
[2025-05-01 17:37:07,534: INFO: data_ingestion: Starting download from https://raw.githubusercontent.com/kartheek2003/data_zip_repo/refs/heads/main/data.csv...]
[2025-05-01 17:37:07,846: INFO: data_ingestion: Downloaded file to: artifacts/data_ingestion/data.csv]
[2025-05-01 17:37:07,939: INFO: common: created directory at: artifacts/preprocessing]
[2025-05-01 17:37:07,954: INFO: pre_processing: data frame preprocessing]
[2025-05-01 17:37:08,048: INFO: pre_processing: df_cleaned saved at artifacts/preprocessing]
[2025-05-01 17:37:08,050: INFO: common: created directory at: artifacts/report]
[2025-05-01 17:37:08,088: INFO: 2672153331: df_cleaned loaded as d

<Figure size 640x480 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1600x1200 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 7576.74x7500 with 0 Axes>

<Figure size 1200x600 with 0 Axes>