In [1]:
import os

In [3]:
%pwd

'c:\\code\\ML\\breast_cancer\\research'

In [4]:
# Move from the research/ folder up to the project root. The guard makes the
# cell safe to re-run: without it every extra execution climbs one more level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [None]:
%pwd

'c:\\code\\ML\\breast_cancer'

In [7]:
!pip install scikit-learn





In [None]:


from sklearn.model_selection import train_test_split

In [9]:
#config entity


In [37]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class DataTransform:
    """Settings consumed by the data-transformation stage.

    Field order is part of the constructor interface and is kept unchanged.
    """
    # CSV file that the transformation stage reads with pandas.
    df_path : Path
    # transformed_df_path : Path
    # Forwarded to train_test_split(test_size=...). Annotated as float because
    # it is normally a fraction of the dataset (e.g. 0.2); an absolute row
    # count (int) is still acceptable wherever a float is expected.
    test_size : float
    # Seed forwarded to both train_test_split and SMOTE.
    random_state : int
    # Forwarded to SMOTE(k_neighbors=...).
    smote_k_neighbours : int
    # Only referenced by the (currently disabled) PCA step.
    n_pca_components: int
    # Directory in which the fitted scaler is stored as "scaler.pkl".
    scaler_path : Path
    # pca_model_path : Path


In [12]:
#configuration

In [31]:
from breast_cancer.entity.config_entity import DataIngestionConfig , PreProcessing , EDAconfig , DataTransform
from breast_cancer.utils.common import read_yaml , create_directories
from breast_cancer.constants import *
class ConfigurationManager:
    """Turns the YAML config/params files into per-stage config entities."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion root directory and return the ingestion settings."""
        ingestion = self.config.data_ingestion
        create_directories([ingestion.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion.root_dir,
            source_url=ingestion.source_url,
            local_data_file=ingestion.local_data_file,
        )

    def preprocessing(self) -> PreProcessing:
        """Create the pre-processing output directory and return its settings."""
        stage_params = self.params.preprocessing
        stage_config = self.config.preprocessing

        create_directories([stage_config.df_pre_dir])

        return PreProcessing(
            df_pre_dir=stage_config.df_pre_dir,
            drop_columns=stage_params.drop_columns,
            axis=stage_params.axis,
        )

    def EDA_configuration(self) -> EDAconfig:
        """Create the report directory and return the EDA settings."""
        eda = self.config.EDA_config
        create_directories([eda.report_path])

        return EDAconfig(
            report_path=eda.report_path,
            df_clean_path=eda.df_clean_path,
        )

    def data_transformation_config(self) ->  DataTransform:
        """Create the scaler directory and return the transformation settings."""
        stage_config = self.config.DataTransform
        stage_params = self.params.DataTransform

        # Only the scaler directory is created; the transformed-data and PCA
        # model locations are not part of this configuration at the moment.
        create_directories([stage_config.scaler_path])

        return DataTransform(
            df_path=stage_config.df_path,
            test_size=stage_params.test_size,
            random_state=stage_params.random_state,
            smote_k_neighbours=stage_params.smote_k_neighbours,
            n_pca_components=stage_params.n_pca_components,
            scaler_path=stage_config.scaler_path,
        )

    

In [32]:
#components


In [33]:
import imblearn


In [34]:
from breast_cancer import logger
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
# from sklearn.decomposition import PCA
import joblib


class DataTransformation:
    """Splits, scales and class-balances the dataset described by a DataTransform config."""

    def __init__(self,config : DataTransform):
        """Keep the transformation settings for later use."""
        self.config = config

    def load_data(self):
        """Read the CSV located at ``config.df_path`` into a DataFrame."""
        return pd.read_csv(self.config.df_path)

    def get_data_transformation(self):
        """Build the train and test sets.

        Steps: train/test split, standard scaling, then SMOTE oversampling.
        The scaler is fitted on the training split only and SMOTE is applied
        to the training split only, so the test split stays untouched by
        either fitting step. The fitted scaler is saved as ``scaler.pkl``
        inside ``config.scaler_path``.

        Returns:
            Tuple ``(x_res, x_test_scaled, y_res, y_test)``: the scaled and
            oversampled training features, the scaled test features, the
            oversampled training labels and the original test labels.

        Raises:
            Any exception from loading, splitting, scaling, saving or
            resampling propagates unchanged.
        """
        df = self.load_data()
        x = df.drop(['diagnosis'],axis=1)

        # Encode the labels with an explicit element-wise mapping instead of
        # Series.replace, whose silent object -> int downcast is deprecated in
        # pandas. Values other than 'M'/'B' pass through unchanged, exactly as
        # replace() left them.
        label_codes = {'M': 1, 'B': 0}
        y = df['diagnosis'].map(lambda label: label_codes.get(label, label))

        #split
        logger.info("train test split")
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=self.config.test_size,
            random_state=self.config.random_state,
        )

        #scaling
        logger.info("Standard Scaling")
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        joblib.dump(scaler , os.path.join(self.config.scaler_path,"scaler.pkl"))
        x_test_scaled = scaler.transform(x_test)

        #SMOTE
        logger.info("synthetic minority over sampling technique")
        smote = SMOTE(
            k_neighbors=self.config.smote_k_neighbours,
            random_state=self.config.random_state,
        )
        x_res , y_res = smote.fit_resample(x_train_scaled,y_train)

        # NOTE: the PCA step is currently disabled, so config.n_pca_components
        # is not used here.
        return x_res,x_test_scaled, y_res, y_test
            

        



In [35]:
#pipeline

In [38]:
# Run the transformation stage end to end. Exceptions propagate on their own,
# so no try/except that merely re-raises is needed.
config = ConfigurationManager()
data_tra_config = config.data_transformation_config()
data_transformation = DataTransformation(config=data_tra_config)
x_train, x_test, y_train, y_test = data_transformation.get_data_transformation()

[2025-05-06 17:17:02,113: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-06 17:17:02,119: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-06 17:17:02,123: INFO: common: created directory at: artifacts]
[2025-05-06 17:17:02,126: INFO: common: created directory at: artifacts/models/scaler]
[2025-05-06 17:17:02,219: INFO: 1744828144: train test split]
[2025-05-06 17:17:02,225: INFO: 1744828144: Standard Scaling]
[2025-05-06 17:17:02,240: INFO: 1744828144: synthetic minority over sampling technique]


  y = df['diagnosis'].replace({'M':1,'B':0})


In [22]:
x_train

array([[ 3.09599095,  9.01906107,  2.71430853, -0.51922381,  4.30995322],
       [ 8.33012077,  1.3381213 , -2.96536296,  0.68349725,  0.38951814],
       [-3.09864942,  3.03124793,  3.32676384, -0.13206642, -0.99722258],
       ...,
       [ 2.71585201, -1.4088932 , -2.00219031,  1.91227763,  0.85945067],
       [ 3.42666745, -0.96322651,  1.12084589,  0.2843452 , -0.53900591],
       [ 3.51057782, -2.62022778,  1.33126719,  0.43539446, -0.69789864]],
      shape=(572, 5))

In [25]:
len(x_test)

114