In [1]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [2]:
# Load the scikit-learn digits dataset; `digits.data` and `digits.target` are used below.
digits = load_digits()

In [3]:
# Keep the labels in their own variable and wrap the feature matrix in a
# DataFrame (columns get default integer labels).
y = digits.target
data = pd.DataFrame(data=digits.data)

In [4]:
# Append the labels as a 'target' column (mutates `data` in place); the
# transformation step later splits this column back off by name.
data['target'] = y

In [5]:
# Preview the first rows, including the appended 'target' column.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [6]:
# df_all = data.iloc[:, :-1]


In [7]:
# df_all

In [8]:
# NOTE: `data` still contains the 'target' column, so X_train / X_test are
# full frames (features + label). DataTransformation drops the label itself.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    random_state=42,
    test_size=0.3,
)

In [9]:
# Sanity check: the split returns the test partition as a pandas DataFrame.
type(X_test)

pandas.core.frame.DataFrame

In [10]:
# Column labels are a mix of integers (features) and the string 'target'.
X_train.columns

Index([       0,        1,        2,        3,        4,        5,        6,
              7,        8,        9,       10,       11,       12,       13,
             14,       15,       16,       17,       18,       19,       20,
             21,       22,       23,       24,       25,       26,       27,
             28,       29,       30,       31,       32,       33,       34,
             35,       36,       37,       38,       39,       40,       41,
             42,       43,       44,       45,       46,       47,       48,
             49,       50,       51,       52,       53,       54,       55,
             56,       57,       58,       59,       60,       61,       62,
             63, 'target'],
      dtype='object')

In [24]:
import sys
import os
from dataclasses import dataclass
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.utils.exception import CustomException
from src.utils.logger import get_logger
from src.utils.utility import save_object

# Logger named after the current module (shows up as `__main__` when run in the notebook).
logger = get_logger(__name__)


@dataclass
class DataTransformationConfig:
    """Filesystem settings for the data-transformation step."""

    # Path the preprocessor object is saved to. The type annotation is what
    # makes this a real dataclass field (visible to __init__/__repr__/__eq__ and
    # overridable per instance); without it @dataclass ignores the attribute.
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')


class DataTransformation:
    """Build, fit and persist the preprocessing step for train/test frames."""

    def __init__(self):
        self.config = DataTransformationConfig()

    def get_data_transformer_obj(self, numerical_columns):
        """Create the (unfitted) preprocessing object.

        Args:
            numerical_columns: Column selection handed to the ColumnTransformer;
                each selected column is median-imputed and then standard-scaled.

        Returns:
            ColumnTransformer: Transformer with a single numerical pipeline.

        Raises:
            CustomException: If constructing the transformer fails.
        """
        try:
            numerical_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scalar', StandardScaler()),
            ])

            return ColumnTransformer(transformers=[
                ('numerical_pipeline', numerical_pipeline, numerical_columns),
            ])
        except Exception as e:
            # Chain explicitly so the original traceback is kept as __cause__.
            raise CustomException(e, sys) from e

    def initiate_data_transformation(self, train_set, test_set, target_column):
        """Fit the preprocessor on the training data, transform both sets, save it.

        Every column other than ``target_column`` is treated as numerical. The
        preprocessor is fitted on the training features only and then applied
        to the test features. The fitted object is saved to
        ``self.config.preprocessor_obj_file_path``.

        Args:
            train_set: DataFrame with the feature columns plus ``target_column``.
            test_set: DataFrame with the same layout as ``train_set``.
            target_column: Label of the column holding the target values.

        Returns:
            tuple: ``(train, test)``. ``train`` is the array of transformed
            training features with the target stacked on as the last column.
            ``test`` holds the same layout for the test data but wrapped in a
            ``pandas.DataFrame`` (default integer column labels). The two
            return types differ; this is kept for existing callers.

        Raises:
            CustomException: If any step fails.
        """
        try:
            logger.info(f"Type: {type(test_set)}")

            logger.info("Train/Test data loaded successfully.")
            logger.info("Creating proprocessing object...")

            # Separate features from the label for both partitions.
            X_train = train_set.drop(target_column, axis=1)
            y_train = train_set[target_column]
            X_test = test_set.drop(target_column, axis=1)
            y_test = test_set[target_column]

            numerical_columns = X_train.columns.to_list()

            preprocessor = self.get_data_transformer_obj(numerical_columns)

            logger.info("Applying preprocessing object on train and test data.")

            # Fit on train only; the test set is transformed with the
            # statistics learned from the training data.
            X_train_array = preprocessor.fit_transform(X_train)
            X_test_array = preprocessor.transform(X_test)

            logger.info(f"Preprocessing completed. Training shape: {X_train_array.shape}")

            # Re-attach the target as the last column.
            X_train_array = np.column_stack((X_train_array, y_train.to_numpy()))
            X_test_array = np.column_stack((X_test_array, y_test.to_numpy()))

            test = pd.DataFrame(X_test_array)

            save_object(
                file_path=self.config.preprocessor_obj_file_path,
                obj=preprocessor,
            )

            logger.info("Preprocessing saved successfully!")

            return X_train_array, test

        except CustomException:
            # Already wrapped (e.g. by get_data_transformer_obj); wrapping it a
            # second time would only nest the message.
            raise
        except Exception as e:
            raise CustomException(e, sys) from e

            

In [25]:
# Data transformation
# Bind the result to a new name instead of overwriting X_train: the method
# needs the original DataFrame (it calls .drop on it), so rebinding X_train to
# the returned array would make this cell fail when run a second time.
transformation = DataTransformation()
train_array, test = transformation.initiate_data_transformation(X_train, X_test, target_column='target')

[ 2026-01-03 16:30:00,293 ] 45 __main__ - INFO - Type: <class 'pandas.core.frame.DataFrame'>
[ 2026-01-03 16:30:00,294 ] 47 __main__ - INFO - Train/Test data loaded successfully.
[ 2026-01-03 16:30:00,295 ] 48 __main__ - INFO - Creating proprocessing object...
[ 2026-01-03 16:30:00,298 ] 59 __main__ - INFO - Applying preprocessing object on train and test data.
[ 2026-01-03 16:30:00,313 ] 64 __main__ - INFO - Preprocessing completed. Training shape: (1257, 64)
[ 2026-01-03 16:30:00,319 ] 17 src.utils.utility - INFO - Object saved sucessfully at: artifacts\preprocessor.pkl
[ 2026-01-03 16:30:00,320 ] 76 __main__ - INFO - Preprocessing saved successfully!


In [26]:
# Inspect the transformed test data: scaled features with the target in the last column.
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0.0,-0.351631,-1.079464,-1.113006,0.032749,-1.030913,-0.400396,-0.129943,-0.059636,-0.621944,...,1.869361,-0.028217,-0.316460,-1.077192,-0.688142,0.434839,1.231819,0.709036,-0.191786,6.0
1,0.0,-0.351631,1.198929,0.974780,-0.908177,-1.030913,-0.400396,-0.129943,-0.059636,1.216826,...,-0.214707,-0.028217,-0.316460,1.433490,0.896512,-0.182547,-0.987493,-0.505874,-0.191786,9.0
2,0.0,-0.351631,0.577549,0.742804,0.032749,-0.312588,-0.400396,-0.129943,-0.059636,0.910364,...,-0.214707,-0.028217,-0.316460,1.433490,0.896512,0.640635,0.207521,-0.505874,-0.191786,3.0
3,0.0,-0.351631,-0.665211,0.046875,0.032749,1.124061,2.259430,1.705318,-0.059636,-0.621944,...,-0.214707,-0.028217,-0.316460,-0.497804,0.670133,-1.828912,-1.158209,-0.505874,-0.191786,7.0
4,0.0,1.836489,1.613182,0.974780,-0.437714,-1.030913,-0.400396,-0.129943,-0.059636,1.216826,...,-0.214707,-0.028217,3.064191,1.819749,0.896512,0.846430,0.548954,-0.262892,-0.191786,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,0.0,-0.351631,-1.079464,-2.736840,-1.378640,1.662805,0.190676,-0.129943,-0.059636,-0.621944,...,-0.214707,-0.028217,-0.316460,-1.077192,-2.725555,-0.799934,0.207521,-0.505874,-0.191786,4.0
536,0.0,-0.351631,0.784675,0.046875,0.738444,1.662805,0.190676,-0.129943,-0.059636,-0.315482,...,-0.214707,-0.028217,-0.316460,1.047231,-0.688142,-2.446298,-1.158209,-0.505874,-0.191786,7.0
537,0.0,-0.351631,-0.043831,0.742804,0.032749,-0.851332,-0.400396,-0.129943,-0.059636,-0.621944,...,-0.214707,-0.028217,-0.316460,-0.111545,0.670133,-0.182547,-0.475344,-0.505874,-0.191786,0.0
538,0.0,-0.351631,0.991802,0.974780,0.738444,-1.030913,-0.400396,-0.129943,-0.059636,0.603903,...,-0.214707,-0.028217,-0.316460,1.240361,-0.235384,-2.446298,-1.158209,-0.505874,-0.191786,7.0


In [30]:
# Small check of how np.column_stack treats 1-D inputs: each array becomes
# one column of the result.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
c = np.column_stack([a, b])

In [31]:
# Two 1-D arrays of length 3 stack into a single (3, 2) array.
a.shape, b.shape, c.shape

((3,), (3,), (3, 2))