In [2]:
import os

In [3]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action\\research'

In [4]:
# Step from the notebook's folder up to its parent directory so that relative
# paths used later are resolved from there.
# NOTE: not idempotent - re-running this cell climbs one more level each time.
os.chdir(os.pardir)

In [5]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``:
    the path-like fields come from the ``data_transformation`` section of the
    project config, the column lists and target name come from the schema file.
    """
    # Stage output directory (created by ConfigurationManager before this object is built).
    # NOTE(review): annotated as Path, but the value is passed through from the
    # config unchanged - no conversion to Path is visible here.
    root_dir: Path
    # CSV files read with pandas.read_csv as the raw train / test splits.
    train_data: str
    test_data: str
    # Destinations handed to numpy.savez for the transformed train / test arrays.
    transformed_train_data: str
    transformed_test_data: str
    # Columns routed to StandardScaler.
    num_features: list
    # Columns routed to OrdinalEncoder.
    or_columns: list
    # Columns routed to OneHotEncoder.
    oh_columns: list
    # Columns routed to PowerTransformer(method='yeo-johnson').
    transform_columns: list
    # Columns removed from both splits (names that are absent are ignored).
    drop_columns: list
    # Name of the label column that is split off from the features.
    target_column: str
    # Destination handed to joblib.dump for the fitted ColumnTransformer.
    preprocessor: str


In [7]:
from us_visa.constants import *
from us_visa.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project config and schema YAML files and builds stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, schema_filepath=SCHEMA_FILe_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: location of the project config YAML.
            schema_filepath: location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Paths are taken from the ``data_transformation`` section of the config;
        column lists and the target column are taken from the schema.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        schema = self.schema
        return DataTransformationConfig(
            root_dir=stage.root_dir,
            train_data=stage.train_data,
            test_data=stage.test_data,
            transformed_train_data=stage.transformed_train_data,
            transformed_test_data=stage.transformed_test_data,
            preprocessor=stage.preprocessor,
            num_features=schema.num_features,
            or_columns=schema.or_columns,
            oh_columns=schema.oh_columns,
            transform_columns=schema.transform_columns,
            drop_columns=schema.drop_columns,
            target_column=schema.target_column,
        )





In [10]:
import pandas as pd
import numpy as np
from joblib import dump
from us_visa import logger
from us_visa.constants import CURRENT_YEAR,VALIDATION_STATUS


from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek, SMOTEENN

In [None]:
class DataTransformation:
    """Turns the raw train/test CSV files into model-ready arrays.

    The public steps chain into each other: ``handle_imbalanced_data`` calls
    ``transform``, which calls ``data_analysis_and_transformation``. The final
    arrays can be persisted with ``save_as_npz``.
    """

    # Target value encoded as 1; every other target value is encoded as 0.
    POSITIVE_LABEL = "Certified"

    # The annotation is quoted so that defining this class does not depend on
    # the config class already being present in the namespace.
    def __init__(self, config: "DataTransformationConfig"):
        self.config = config

    @staticmethod
    def load_data(file_path: str):
        """Read the CSV at ``file_path`` into a pandas DataFrame."""
        return pd.read_csv(file_path)

    @staticmethod
    def _columns_with_na(frame):
        """Return the names of the columns of ``frame`` containing at least one null."""
        return [column for column in frame.columns if frame[column].isnull().any()]

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense array when it exposes ``toarray()``.

        ColumnTransformer can return a SciPy sparse matrix when the stacked output
        is sparse enough. Anything without ``toarray()`` is returned unchanged.
        """
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix

    def _engineer_features(self, frame):
        """Return a copy of ``frame`` with ``company_age`` added and the configured columns dropped."""
        with_age = frame.assign(company_age=CURRENT_YEAR - frame["yr_of_estab"])
        # errors="ignore": configured names that are not present are skipped silently.
        return with_age.drop(columns=self.config.drop_columns, errors="ignore")

    def _split_features_target(self, frame):
        """Split ``frame`` into (features, 0/1 encoded target array)."""
        target = self.config.target_column
        features = frame.drop(columns=target)
        encoded_target = np.where(frame[target] == self.POSITIVE_LABEL, 1, 0)
        return features, encoded_target

    def data_analysis_and_transformation(self):
        """Load both splits, log basic data-quality facts and do the initial preparation.

        Steps: report columns with missing values and duplicate-row counts, add
        ``company_age`` (``CURRENT_YEAR - yr_of_estab``), drop the configured
        columns, separate features from the target and encode the target as 0/1.

        Returns:
            tuple: ``(train_X, train_y_encoded, test_X, test_y_encoded)`` where the
            ``X`` items are DataFrames and the ``y`` items are NumPy arrays.
        """
        train_raw = self.load_data(self.config.train_data)
        test_raw = self.load_data(self.config.test_data)
        splits = (("train", train_raw), ("test", test_raw))

        # The split name is part of the message so the two log lines can be told apart.
        for split_name, frame in splits:
            logger.info(f"columns with missing values in {split_name} data: {self._columns_with_na(frame)}")

        for split_name, frame in splits:
            logger.info(f"number of duplicate rows in {split_name} data: {frame.duplicated().sum()}")

        train_prepared = self._engineer_features(train_raw)
        test_prepared = self._engineer_features(test_raw)
        logger.info("Added company_age column to train and test data")
        logger.info(f"Dropped columns: {self.config.drop_columns}")

        train_X, train_y_encoded = self._split_features_target(train_prepared)
        test_X, test_y_encoded = self._split_features_target(test_prepared)
        logger.info("Separated features and target variable")
        logger.info("Encoded target variable")

        return train_X, train_y_encoded, test_X, test_y_encoded

    def transform(self):
        """Fit the preprocessing ColumnTransformer on train data and apply it to both splits.

        The fitted preprocessor is written with ``joblib.dump`` to
        ``config.preprocessor``.

        Returns:
            tuple: ``(X_train_transformed, train_y, X_test_transformed, test_y)``.
        """
        train_X, train_y, test_X, test_y = self.data_analysis_and_transformation()

        power_pipeline = Pipeline(
            steps=[("transformer", PowerTransformer(method="yeo-johnson"))]
        )

        # Step names and their order are kept stable: they are part of the fitted
        # object that gets serialized below.
        # NOTE(review): OneHotEncoder is used with its defaults, so categories that
        # appear only in the test split raise at transform time - confirm this is intended.
        # Columns not named in any list are dropped (ColumnTransformer's default remainder).
        preprocessor = ColumnTransformer(
            [
                ("OneHotEncoder", OneHotEncoder(), self.config.oh_columns),
                ("Ordinal_Encoder", OrdinalEncoder(), self.config.or_columns),
                ("Transformer", power_pipeline, self.config.transform_columns),
                ("StandardScaler", StandardScaler(), self.config.num_features),
            ]
        )

        # Fit on train only; the test split is transformed with the train statistics.
        X_train_transformed = preprocessor.fit_transform(train_X)
        X_test_transformed = preprocessor.transform(test_X)

        dump(preprocessor, self.config.preprocessor)

        logger.info("Transformed training and testing data")

        return X_train_transformed, train_y, X_test_transformed, test_y

    def handle_imbalanced_data(self):
        """Run ``transform`` and resample the training split with SMOTETomek.

        Only the training data is resampled (``random_state=42``); the test split
        is returned exactly as produced by ``transform``.

        Returns:
            tuple: ``(X_train_res, y_train_res, X_test_transformed, test_y)``.
        """
        X_train_transformed, train_y, X_test_transformed, test_y = self.transform()

        resampler = SMOTETomek(random_state=42)
        X_train_res, y_train_res = resampler.fit_resample(X_train_transformed, train_y)
        logger.info("Resampled training data")

        return X_train_res, y_train_res, X_test_transformed, test_y

    def save_as_npz(self, X_train, y_train, X_test, y_test):
        """Write both splits as ``.npz`` archives with keys ``X`` and ``y``.

        Feature matrices are converted to dense arrays first: a sparse matrix
        handed to ``np.savez`` is stored as a pickled object array, which
        ``np.load`` refuses to read with its default ``allow_pickle=False``.
        """
        np.savez(self.config.transformed_train_data, X=self._to_dense(X_train), y=y_train)
        np.savez(self.config.transformed_test_data, X=self._to_dense(X_test), y=y_test)
        logger.info(f"Saved transformed data to {self.config.transformed_train_data} and {self.config.transformed_test_data}")



In [12]:
# Load the file that VALIDATION_STATUS (imported from us_visa.constants) points to.
status = read_yaml(VALIDATION_STATUS)


[2026-02-12 16:53:48,209: INFO: common: yaml file: artifacts\data_validation\validation_status.yml loaded successfully]


In [14]:
# Display the flag stored in the loaded validation-status file.
status.validation_status

True

In [None]:
# Run the data-transformation stage, but only when the stored validation flag is True.
STATUS = read_yaml(VALIDATION_STATUS)

if STATUS.validation_status is True:
    try:
        config_manager = ConfigurationManager()
        data_transformation_config = config_manager.get_data_transformation_config()
        data_transformation = DataTransformation(config=data_transformation_config)
        # Only the training split is resampled; the test split is transformed only.
        X_train_res, y_train_res, X_test_res, y_test_res = data_transformation.handle_imbalanced_data()
        data_transformation.save_as_npz(X_train_res, y_train_res, X_test_res, y_test_res)
    except Exception as e:
        logger.exception(f"Error in data transformation: {e}")
        # Re-raise so a failed stage is visible instead of the cell looking successful.
        raise
else:
    # Make the skip explicit rather than finishing silently with no artifacts written.
    logger.info("Data validation status is not True; skipping data transformation")

[2026-02-12 17:09:55,653: INFO: common: yaml file: artifacts\data_validation\validation_status.yml loaded successfully]
[2026-02-12 17:09:55,681: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-12 17:09:55,691: INFO: common: yaml file: config\schema.yaml loaded successfully]
[2026-02-12 17:09:55,694: INFO: common: created directory at: artifacts]
[2026-02-12 17:09:55,697: INFO: common: created directory at: artifacts/data_transformation]
[2026-02-12 17:09:55,781: INFO: 3195190461: columns with missing values: []]
[2026-02-12 17:09:55,781: INFO: 3195190461: columns with missing values: []]
[2026-02-12 17:09:55,814: INFO: 3195190461: number of duplicate rows in train data: 0]
[2026-02-12 17:09:55,818: INFO: 3195190461: number of duplicate rows in test data: 0]
[2026-02-12 17:09:55,821: INFO: 3195190461: Added company_age column to train and test data]
[2026-02-12 17:09:55,821: INFO: 3195190461: Dropped columns: ['case_id', 'yr_of_estab']]
[2026-02-12 17:09:55,83