In [1]:
import os

In [2]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject\\notebook'

In [3]:
# Change the current working directory to the project root.
# Guarded so that re-running this cell does not keep climbing the directory tree:
# the move only happens while the kernel is still inside the notebook/ folder.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data ingestion stage."""
    # Directory created for this stage by ConfigurationManager.get_data_ingestion_config.
    root_dir: Path
    # File that DataIngestion.copy_data_file copies from.
    source_data_file: Path
    # File that DataIngestion.copy_data_file copies to.
    local_data_file: Path

In [6]:
from defaultMlProj.constants.constant import *
from defaultMlProj.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load the config and params YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Returns:
            DataIngestionConfig built from the ``data_ingestion`` section of the config file.
        """
        ingestion = self.config.data_ingestion
        create_directories([ingestion.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion.root_dir,
            source_data_file=ingestion.source_data_file,
            local_data_file=ingestion.local_data_file,
        )

In [8]:
import os
import shutil
from defaultMlProj import logger
from defaultMlProj.utils.common import get_size


In [13]:
class DataIngestion:
    """Copies the raw dataset from its source location to the ingestion artifact path."""

    def __init__ (self, config: DataIngestionConfig):
        """Store the ingestion settings (source_data_file and local_data_file are read later)."""
        self.config = config

    def copy_data_file(self):
        """Copy ``config.source_data_file`` to ``config.local_data_file``.

        The copy is skipped when the destination file already exists.

        Raises:
            FileNotFoundError: if the source file does not exist.
            Exception: any other error raised while creating the destination
                directory or copying; it is logged and re-raised unchanged.
        """
        source = Path(self.config.source_data_file)
        destination = Path(self.config.local_data_file)

        try:
            logger.info(f"Starting data ingestion: copying {source} to {destination}")

            # Check the source first so a bad path does not leave an empty
            # destination directory behind.
            if not source.exists():
                raise FileNotFoundError(f"Source file {source.absolute()} does not exist")

            destination.parent.mkdir(parents=True, exist_ok=True)

            if destination.exists():
                logger.info(f"File destination {destination} already exists. Skipping copy.")
            else:
                shutil.copy(source, destination)
                logger.info(f"File copied successfully: {source} to {destination}")

        except Exception as e:
            logger.exception(f"Error occurred while copying data file: {e}")
            # Bare raise keeps the original traceback intact.
            raise

In [12]:
from pathlib import Path

# Sanity check: is the raw dataset reachable from the current working directory?
source_path = Path("notebook/data/default.csv")

print(f"Current working directory: {Path('.').absolute()}")
print(f"Expected source path: {source_path.absolute()}")
print(f"Does file exist? {source_path.exists()}")

Current working directory: d:\End-to-end-Default-Risk-Pred-mlProject
Expected source path: d:\End-to-end-Default-Risk-Pred-mlProject\notebook\data\default.csv
Does file exist? True


In [14]:
# Run the data ingestion stage: load the configuration, build the component, copy the file.
# Failures inside copy_data_file are already logged there, so exceptions are simply
# allowed to propagate instead of being caught and re-raised unchanged.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.copy_data_file()

[2025-07-22 23:44:21,641: INFO: common: YAML file config\config.yaml loaded successfully.]
[2025-07-22 23:44:21,645: INFO: common: YAML file params.yaml loaded successfully.]
[2025-07-22 23:44:21,647: INFO: common: Created directory: artifacts]
[2025-07-22 23:44:21,648: INFO: common: Created directory: artifacts/data_ingestion]
[2025-07-22 23:44:21,649: INFO: 653153596: Starting data ingestion:copyingnotebook\data\default.csv to artifacts\data_ingestion\default.csv]
[2025-07-22 23:44:21,652: INFO: 653153596: File copied successfully: notebook\data\default.csv to artifacts\data_ingestion\default.csv]


##### Stage two: Data Validation

In [1]:
import os

In [2]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject\\notebook'

In [3]:
# Move up to the project root only while still inside notebook/, so that
# re-running this cell does not climb above the project directory.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject'

In [5]:
import pandas as pd

In [7]:
# Load the ingested dataset for inspection. The separator is given explicitly as a tab;
# with it the file parses into the 10 columns reported by df.shape / df.info() below.
df = pd.read_csv(r"artifacts/data_ingestion/default.csv", sep="\t")
# Preview the first five rows.
df.head()

Unnamed: 0,credit_score,income,loan_amount,loan_term,interest_rate,debt_to_income_ratio,employment_years,savings_balance,age,default_risk_score
0,810,107410,11924,48,7.97,43.29,32,27181,58,7634.543366
1,418,37482,19291,24,6.94,11.01,33,15089,43,6249.833059
2,724,85641,39501,36,8.59,37.11,0,97459,33,2148.11799
3,444,73331,25714,36,13.09,33.39,18,2413,48,4979.385344
4,440,46723,35651,36,8.3,46.21,6,9716,42,2993.85195


In [8]:
# (rows, columns) of the ingested dataset.
df.shape

(800, 10)

In [9]:
# Column names, non-null counts and dtypes, used to decide what the schema should declare.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   credit_score          800 non-null    int64  
 1   income                800 non-null    int64  
 2   loan_amount           800 non-null    int64  
 3   loan_term             800 non-null    int64  
 4   interest_rate         800 non-null    float64
 5   debt_to_income_ratio  800 non-null    float64
 6   employment_years      800 non-null    int64  
 7   savings_balance       800 non-null    int64  
 8   age                   800 non-null    int64  
 9   default_risk_score    800 non-null    float64
dtypes: float64(3), int64(7)
memory usage: 62.6 KB


In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data validation stage."""
    # Directory created for this stage by ConfigurationManager.get_data_validation_config.
    root_dir: Path
    # Path of the text file that DataValidation writes the validation status to.
    STATUS_FILE: str
    # Mapping whose keys are the expected column names (DataValidation reads .keys()).
    all_schema: dict

In [11]:
from defaultMlProj.constants.constant import *
from defaultMlProj.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Reads the config, params and schema YAML files and builds the validation settings."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the three YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Create the validation directory and return the validation settings.

        Returns:
            DataValidationConfig built from the ``data_validation`` section of the
            config file and the ``columns`` section of the schema file.
        """
        config = self.config.data_validation
        schema = self.schema.columns

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            # all_schema is a required field; omitting it raised TypeError on construction.
            all_schema=schema,
        )

        return data_validation_config

In [14]:
import os
from defaultMlProj import logger
from defaultMlProj.entity.config_entity import DataValidationConfig
import pandas as pd

class DataValidation:
    """Checks that every column of a dataset is declared in the schema."""

    def __init__ (self, config: DataValidationConfig):
        """Store the validation settings (root_dir, STATUS_FILE and all_schema are read later)."""
        self.config =config

    def validate_all_columns(self, data_path=None, sep=",") -> bool:
        """Validate the dataset's column names against the schema and record the result.

        Args:
            data_path: CSV file to validate. When omitted, ``config.root_dir`` is read,
                as the original code did.
            sep: Field separator passed to ``pandas.read_csv``.

        Returns:
            True when the dataset has at least one column and every column name is a
            key of ``config.all_schema``; False otherwise. The same value is written
            to ``config.STATUS_FILE``.

        Raises:
            Exception: any error raised while reading the data or writing the status
                file; it is logged and re-raised unchanged.
        """
        try:
            logger.info("Starting data validation: validating all columns")

            # NOTE(review): the config exposes no dataset path, and root_dir looks like
            # the stage directory rather than a CSV file. Pass data_path explicitly or
            # add a dataset path to the config; confirm the intended source.
            source = self.config.root_dir if data_path is None else data_path
            df = pd.read_csv(source, sep=sep)
            all_cols = list(df.columns)

            all_schema = self.config.all_schema.keys()

            # A column is invalid when it is NOT declared in the schema. The status is
            # derived from all columns at once so the last column cannot overwrite an
            # earlier failure.
            unexpected_cols = [col for col in all_cols if col not in all_schema]
            validation_status = bool(all_cols) and not unexpected_cols

            if unexpected_cols:
                logger.warning(f"Columns not declared in schema: {unexpected_cols}")

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}\n")

            logger.info(f"Data validation completed with status: {validation_status}")

            return validation_status
        except Exception as e:
            logger.exception(f"Error occurred during data validation: {e}")
            # Bare raise keeps the original traceback intact.
            raise





In [None]:
# Run the data validation stage: load the configuration, build the component, validate.
# Failures inside validate_all_columns are already logged there, so exceptions are
# simply allowed to propagate instead of being caught and re-raised unchanged.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_all_columns()

#### Note: The validation stage is skipped in my defaultMlProj

##### Workflows
1. Update config.yaml
2. Update schema.yaml
3. Update params.yaml
4. Update entity
5. Update the configuration manager in src config
6. Update the components
7. Update the pipeline
8. Update the main.py
9. Update the app.py

#### Data transformation stage

In [16]:
import os
from pathlib import Path


@dataclass(frozen=True)
class DataTransformatonConfig:
    """Immutable settings for the data transformation stage."""
    # NOTE(review): the class name is misspelt ("Transformaton"). It is kept because the
    # ConfigurationManager defined in the next cell refers to it by this name.
    # Directory created for this stage by ConfigurationManager.get_data_transformation_config.
    root_dir: Path
    # Value taken from the data_transformation.data_path entry of the config file.
    data_path: Path


In [17]:
from defaultMlProj.constants.constant import *
from defaultMlProj.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Reads the project YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load the config and params YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformatonConfig:
        """Create the transformation directory and return the transformation settings.

        Returns:
            DataTransformatonConfig built from the ``data_transformation`` section
            of the config file.
        """
        transformation = self.config.data_transformation
        create_directories([transformation.root_dir])

        return DataTransformatonConfig(
            root_dir=transformation.root_dir,
            data_path=transformation.data_path,
        )

#### Data Transformation stage

In [1]:
import os

In [2]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject\\notebook'

In [3]:
# Move up to the project root only while still inside notebook/, so that
# re-running this cell does not climb above the project directory.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data transformation stage."""
    # Directory created for this stage; DataTransformation writes train.csv and test.csv here.
    root_dir: Path
    # CSV file that DataTransformation reads before splitting.
    data_path: Path

In [6]:
from defaultMlProj.constants.constant import *
from defaultMlProj.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load the config and params YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the transformation directory and return the transformation settings.

        Returns:
            DataTransformationConfig built from the ``data_transformation`` section
            of the config file.
        """
        transformation = self.config.data_transformation
        create_directories([transformation.root_dir])

        return DataTransformationConfig(
            root_dir=transformation.root_dir,
            data_path=transformation.data_path,
        )

        

In [12]:
# components/data_transformation.py

import os
import pandas as pd
from pathlib import Path
from defaultMlProj import logger
from sklearn.model_selection import train_test_split


class DataTransformation:
    """Splits the ingested dataset into train and test sets and saves both as CSV files."""

    def __init__(self, config: DataTransformationConfig, target_column: str = "default_risk_score"):
        """
        Args:
            config: Settings providing ``data_path`` (input CSV) and ``root_dir`` (output folder).
            target_column: Name of the column to predict.
        """
        self.config = config
        self.target_column = target_column

    def train_test_split(self, test_size=0.2, random_state=42, sep='\t'):
        """Split the dataset and write ``train.csv`` / ``test.csv`` into ``config.root_dir``.

        Args:
            test_size: Share of rows placed in the test set.
            random_state: Seed that makes the shuffle reproducible.
            sep: Field separator of the input CSV.

        Returns:
            Tuple ``(train_df, test_df)``; each frame holds the feature columns
            followed by the target column.

        Raises:
            ValueError: if the target column is missing from the data.
            Exception: any other error raised while reading, splitting or saving;
                it is logged and re-raised unchanged.
        """
        try:
            logger.info("Starting data transformation: train-test split")

            # Read inside the try block so that a missing or unreadable file is logged too.
            df = pd.read_csv(self.config.data_path, sep=sep)
            logger.info(f"Full dataset shape: {df.shape}")

            # Validate target column exists
            if self.target_column not in df.columns:
                raise ValueError(f"Target column '{self.target_column}' not found in data. Columns: {list(df.columns)}")

            # Separate features and target
            X = df.drop(columns=[self.target_column])
            y = df[self.target_column]

            logger.info(f"Feature matrix X shape: {X.shape}")
            logger.info(f"Target vector y shape: {y.shape}")

            # Inside the method body this name resolves to the module-level function
            # imported from sklearn, not to this method.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=test_size,
                random_state=random_state
            )

            logger.info(f"Train features shape: {X_train.shape}, Train target shape: {y_train.shape}")
            logger.info(f"Test features shape: {X_test.shape}, Test target shape: {y_test.shape}")

            # Recombine features and target so each saved file is self-contained;
            # assign() returns a new frame with the target appended as the last column.
            train_df = X_train.assign(**{self.target_column: y_train.values})
            test_df = X_test.assign(**{self.target_column: y_test.values})

            # Save to CSV (comma-separated, without the index column)
            train_csv_path = os.path.join(self.config.root_dir, "train.csv")
            test_csv_path = os.path.join(self.config.root_dir, "test.csv")

            train_df.to_csv(train_csv_path, index=False)
            test_df.to_csv(test_csv_path, index=False)

            logger.info(f"Train dataset saved to {train_csv_path}")
            logger.info(f"Test dataset saved to {test_csv_path}")

            return train_df, test_df

        except Exception as e:
            logger.exception(f"Error occurred during train-test split: {e}")
            # Bare raise keeps the original traceback intact.
            raise

In [13]:
# Run the data transformation stage: load the configuration, build the component, split.
# Failures inside train_test_split are already logged there, so exceptions are simply
# allowed to propagate instead of being caught and re-raised unchanged.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_split()

[2025-07-23 23:50:04,589: INFO: common: YAML file config\config.yaml loaded successfully.]
[2025-07-23 23:50:04,597: INFO: common: YAML file params.yaml loaded successfully.]
[2025-07-23 23:50:04,604: INFO: common: Created directory: artifacts]
[2025-07-23 23:50:04,608: INFO: common: Created directory: artifacts/data_transformation]
[2025-07-23 23:50:04,623: INFO: 1805267760: Starting data transformation: train-test split]
[2025-07-23 23:50:04,629: INFO: 1805267760: Full dataset shape: (800, 10)]
[2025-07-23 23:50:04,654: INFO: 1805267760: Feature matrix X shape: (800, 9)]
[2025-07-23 23:50:04,654: INFO: 1805267760: Target vector y shape: (800,)]
[2025-07-23 23:50:04,671: INFO: 1805267760: Train features shape: (640, 9), Train target shape: (640,)]
[2025-07-23 23:50:04,674: INFO: 1805267760: Test features shape: (160, 9), Test target shape: (160,)]
[2025-07-23 23:50:04,712: INFO: 1805267760: Train dataset saved to artifacts/data_transformation\train.csv]
[2025-07-23 23:50:04,717: INFO:

#### Model Trainer Stage

In [1]:
import os

In [2]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject\\notebook'

In [3]:
# Move up to the project root only while still inside notebook/, so that
# re-running this cell does not climb above the project directory.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model trainer stage."""
    # Directory created for this stage by ConfigurationManager.get_model_trainer_config.
    root_dir: Path
    # CSV file ModelTrainer reads the training rows from.
    train_data_path: Path
    # CSV file ModelTrainer reads the held-out test rows from.
    test_data_path: Path
    # ModelTrainer passes this value directly to joblib.dump and creates its parent
    # directory, i.e. it is used as the output path of the saved model.
    model_name: str

In [6]:
from defaultMlProj.constants.constant import *
from defaultMlProj.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load the config and params YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer directory and return the trainer settings.

        Returns:
            ModelTrainerConfig built from the ``model_trainer`` section of the config file.
        """
        trainer = self.config.model_trainer
        create_directories([trainer.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer.root_dir,
            train_data_path=trainer.train_data_path,
            test_data_path=trainer.test_data_path,
            model_name=trainer.model_name,
        )

    def get_params(self):
        """Return the object that was loaded from the params file in ``__init__``."""
        return self.params

In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import xgboost
from defaultMlProj import logger
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import StackingRegressor    
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [17]:
class ModelTrainer:
    """Builds candidate regressors, picks the best by cross-validated R2, and saves it."""

    def __init__(self, config: ModelTrainerConfig, params):
        """
        Args:
            config: Settings providing train_data_path, test_data_path and model_name.
            params: Object exposing ``target_column``, ``model_params`` and
                ``cv_settings`` as attributes.
        """
        self.config = config
        self.params = params
        self.target_column = params.target_column

    def create_model(self):
        """Instantiate every candidate model from ``params.model_params``.

        Returns:
            Dict mapping a model name to an unfitted estimator, in this order:
            LinearRegression, KNN, DecisionTree, RandomForest, Stacking Regressor.
            The stacking regressor uses the first four as base estimators and a
            linear regression as final estimator.

        Raises:
            Exception: any error raised while building the models; it is logged
                and re-raised unchanged.
        """
        try:
            logger.info("Started creating models")
            # Extract params
            p = self.params.model_params

            models = {}

            # Linear Regression (features are standardised first)
            models['LinearRegression'] = Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', LinearRegression(
                    fit_intercept=p.linear_regression.fit_intercept
                ))
            ])

            # KNN (features are standardised first)
            models['KNN'] = Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', KNeighborsRegressor(
                    n_neighbors=p.knn.n_neighbors,
                    weights=p.knn.weights,
                    algorithm=p.knn.algorithm
                ))
            ])

            # Decision Tree
            models['DecisionTree'] = DecisionTreeRegressor(
                criterion=p.decision_tree.criterion,
                max_depth=p.decision_tree.max_depth,
                min_samples_split=p.decision_tree.min_samples_split,
                min_samples_leaf=p.decision_tree.min_samples_leaf,
                random_state=p.decision_tree.random_state
            )

            # Random Forest
            models['RandomForest'] = RandomForestRegressor(
                n_estimators=p.random_forest.n_estimators,
                criterion=p.random_forest.criterion,
                max_depth=p.random_forest.max_depth,
                min_samples_split=p.random_forest.min_samples_split,
                min_samples_leaf=p.random_forest.min_samples_leaf,
                random_state=p.random_forest.random_state
            )

            # Stacking Regressor: snapshot the base models before the stack itself
            # is added to the dict, so it never contains itself.
            base_estimators = list(models.items())

            final_estimator = LinearRegression(
                fit_intercept=p.linear_regression.fit_intercept
            )

            stacking = StackingRegressor(
                estimators=base_estimators,
                final_estimator=final_estimator,
                cv=p.stacking_regressor.cv,
                n_jobs=p.stacking_regressor.n_jobs
            )

            models['Stacking Regressor'] = stacking
            logger.info(f"Models created: {list(models.keys())}")
            return models

        except Exception as e:
            logger.exception(f"Error occurred while creating models: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def _split_features_target(self, frame, split_name):
        """Return ``(X, y)`` for ``frame``, failing early when the target column is absent."""
        if self.target_column not in frame.columns:
            # KeyError matches what pandas itself would raise, with a clearer message
            # (a wrong CSV separator typically shows up here as one merged column).
            raise KeyError(
                f"Target column '{self.target_column}' not found in {split_name} data. "
                f"Columns: {list(frame.columns)}"
            )
        return frame.drop(columns=[self.target_column]), frame[self.target_column]

    def train_and_evaluate(self):
        """Cross-validate all candidates, refit the best on the train set, test and save it.

        Returns:
            Tuple ``(best_model, test_r2, test_rmse)``: the fitted estimator with the
            highest mean cross-validated R2, plus its R2 and RMSE on the test set.

        Raises:
            KeyError: if the target column is missing from the train or test data.
            Exception: any other error raised while loading, training, evaluating
                or saving; it is logged and re-raised unchanged.
        """
        logger.info("Starting model training with external parameters")
        try:
            # Load data
            train_df = pd.read_csv(self.config.train_data_path, sep=',')
            test_df = pd.read_csv(self.config.test_data_path, sep=',')

            logger.info(f"Train data shape: {train_df.shape}, Test data shape: {test_df.shape}")

            X_train, y_train = self._split_features_target(train_df, "train")
            X_test, y_test = self._split_features_target(test_df, "test")

            logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

            # Create models using params
            models = self.create_model()

            # Get CV settings from params
            cv_params = self.params.cv_settings
            cv = KFold(
                n_splits=cv_params.n_splits,
                shuffle=cv_params.shuffle,
                # KFold raises ValueError when a random_state is given without
                # shuffling, so the seed is only forwarded when it is used.
                random_state=cv_params.random_state if cv_params.shuffle else None
            )
            results = {}

            for name, model in models.items():
                try:
                    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
                except Exception as e:
                    # Record which model failed; the traceback is logged once by the
                    # outer handler.
                    logger.error(f"Failed to evaluate {name}: {e}")
                    raise
                results[name] = scores
                logger.info(f"{name} R2 = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

            # The best model: highest mean CV score; on a tie the earliest-created model wins.
            best_name = max(results, key=lambda k: results[k].mean())
            best_model = models[best_name].fit(X_train, y_train)

            # Final evaluation
            y_pred = best_model.predict(X_test)
            test_r2 = r2_score(y_test, y_pred)
            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            logger.info(f"Best model: {best_name} | Test R2 : {test_r2:.4f}, RMSE : {test_rmse:.4f}")

            # Save model
            # NOTE(review): model_name is used as the full output path and root_dir is
            # not involved; confirm the config value includes the target directory.
            Path(self.config.model_name).parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(best_model, self.config.model_name)
            logger.info(f"Model saved to {self.config.model_name}")

            return best_model, test_r2, test_rmse
        except Exception as e:
            logger.exception(f"Error occurred during model training and evaluation: {e}")
            # Bare raise keeps the original traceback intact.
            raise

In [19]:
# Run the model trainer stage: load configuration and params, build the trainer, train.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    params = config.get_params()
    model_trainer = ModelTrainer(config=model_trainer_config, params=params)
    model_trainer.train_and_evaluate()
except Exception as e:
    # Log failures at error level with the traceback; at INFO level they were
    # indistinguishable from normal progress messages.
    logger.exception(f"Error in model training pipeline: {e}")
    raise

[2025-07-24 12:53:06,629: INFO: common: YAML file config\config.yaml loaded successfully.]
[2025-07-24 12:53:06,651: INFO: common: YAML file params.yaml loaded successfully.]
[2025-07-24 12:53:06,657: INFO: common: Created directory: artifacts]
[2025-07-24 12:53:06,663: INFO: common: Created directory: artifacts/model_trainer]
[2025-07-24 12:53:06,666: INFO: 1173455576: Starting model training with external parameters]


[2025-07-24 12:53:06,692: INFO: 1173455576: Train data shape: (640, 10), Test data shape: (160, 10)]
[2025-07-24 12:53:06,700: INFO: 1173455576: X_train shape: (640, 9), y_train shape: (640,)]
[2025-07-24 12:53:06,705: INFO: 1173455576: X_test shape: (160, 9), y_test shape: (160,)]
[2025-07-24 12:53:06,712: INFO: 1173455576: Started creating models]
[2025-07-24 12:53:06,720: INFO: 1173455576: Models created: ['LinearRegression', 'KNN', 'DecisionTree', 'RandomForest', 'Stacking Regressor']]
[2025-07-24 12:53:06,829: INFO: 1173455576: LinearRegression R2 = 1.0000 (+/- 0.0000)]
[2025-07-24 12:53:06,942: INFO: 1173455576: KNN R2 = 0.8425 (+/- 0.0246)]
[2025-07-24 12:53:07,101: INFO: 1173455576: DecisionTree R2 = 0.9993 (+/- 0.0003)]
[2025-07-24 12:53:10,613: INFO: 1173455576: RandomForest R2 = 0.9995 (+/- 0.0001)]
[2025-07-24 12:53:31,131: INFO: 1173455576: Stacking Regressor R2 = 1.0000 (+/- 0.0000)]
[2025-07-24 12:53:31,146: INFO: 1173455576: Best model: LinearRegression | Test R2 : 1.00

In [25]:
# Diagnostic: list the columns pandas sees when the train file is parsed with a tab separator.
# NOTE(review): the recorded output is a single comma-joined column name, i.e. the file is
# comma-separated; ModelTrainer.train_and_evaluate reads it with sep=',' instead.
# This cell also rebinds train_df to that single-column frame; confirm it is only a check.
train_df = pd.read_csv(model_trainer_config.train_data_path, sep='\t')
print("Columns in train_df:", train_df.columns.tolist())

Columns in train_df: ['credit_score,income,loan_amount,loan_term,interest_rate,debt_to_income_ratio,employment_years,savings_balance,age,default_risk_score']


#### Model Evaluation Stage

In [1]:
import os

In [2]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject\\notebook'

In [3]:
# Move up to the project root only while still inside notebook/, so that
# re-running this cell does not climb above the project directory.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'd:\\End-to-end-Default-Risk-Pred-mlProject'