In [2]:

import os

In [3]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action\\research'

In [4]:
# The notebook is started from the project's research/ folder (see the %pwd
# output above); step up to the project root so relative paths resolve there.
# The guard keeps the cell idempotent: re-running it must not climb another
# level once we are already at the root.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion step.

    Built by ``ConfigurationManager.get_data_ingestion_config`` and read by
    ``DataIngestion``. ``frozen=True`` means fields cannot be reassigned after
    construction. Dataclasses do not enforce the annotations at runtime, so the
    actual field types are whatever the builder passes in.
    """

    # Directory that is created for this step and receives the train/test CSVs.
    root_dir: Path
    # Name of the MongoDB database to read from.
    DB_NAME: str
    # File name (joined onto root_dir) for the training split CSV.
    train_file_name: str
    # File name (joined onto root_dir) for the test split CSV.
    test_file_name: str
    # Path of the CSV holding the full dataset after the "_id" column is dropped.
    local_data_file: Path
    # Name of the MongoDB collection to read from.
    COLLECTION_NAME: str
    # Connection string handed to pymongo.MongoClient.
    CONNECTION_URL: str

In [7]:
from us_visa.constants import *
from us_visa.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Read the project's YAML configuration and build per-stage config objects.

    On construction the YAML file is loaded with ``read_yaml`` and the
    directory named by its ``artifacts_root`` entry is created.

    Only the data-ingestion stage is exposed here
    (``get_data_ingestion_config``); getters for other pipeline stages are not
    implemented in this class.

    Note: the config file is in YAML format and defaults to CONFIG_FILE_PATH.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the configuration and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML config file; defaults to
                ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Returns:
            DataIngestionConfig: values taken from the ``data_ingestion``
            section of the loaded config, plus ``CONNECTION_URL``, which is a
            module-level name (presumably supplied by the ``us_visa.constants``
            star import - verify).
        """
        ingestion_cfg = self.config.data_ingestion

        create_directories([ingestion_cfg.root_dir])

        return DataIngestionConfig(
            # The dataclass declares these two fields as Path; coerce them so
            # the runtime type matches the annotation instead of leaking the
            # raw config value through.
            root_dir=Path(ingestion_cfg.root_dir),
            DB_NAME=ingestion_cfg.DB_NAME,
            train_file_name=ingestion_cfg.train_file_name,
            test_file_name=ingestion_cfg.test_file_name,
            local_data_file=Path(ingestion_cfg.local_data_file),
            COLLECTION_NAME=ingestion_cfg.COLLECTION_NAME,
            CONNECTION_URL=CONNECTION_URL,
        )

In [None]:
import pymongo
import pandas as pd

from us_visa import logger
from us_visa.exception import UsVisaException

In [None]:
class DataIngestion:
    """Pull records from MongoDB and persist them locally as CSV files.

    The work is split into three methods:
      1. ``load_data``  - read the configured collection into a DataFrame.
      2. ``save_data``  - drop the ``_id`` column and write the full dataset.
      3. ``split_data`` - write a train/test split into ``config.root_dir``.

    Any exception raised inside a step is re-raised as ``UsVisaException``.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings used by the other methods."""
        self.config = config

    def load_data(self) -> pd.DataFrame:
        """Fetch every document of the configured collection.

        Returns:
            pd.DataFrame: one row per document returned by ``find()``.

        Raises:
            UsVisaException: if connecting or reading fails.
        """
        try:
            # The context manager closes the client's connections as soon as
            # the documents have been materialised into a list.
            with pymongo.MongoClient(self.config.CONNECTION_URL) as client:
                collection = client[self.config.DB_NAME][self.config.COLLECTION_NAME]
                records = list(collection.find())
            data = pd.DataFrame(records)
        except Exception as e:
            raise UsVisaException(e) from e

        if data.empty:
            # Surface an empty result here; downstream steps would otherwise
            # quietly write empty files.
            logger.warning(
                f"No documents found in {self.config.DB_NAME}.{self.config.COLLECTION_NAME}"
            )
        return data

    def save_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Drop the ``_id`` column and write the frame to ``local_data_file``.

        Note: the column is removed from ``data`` in place, so the caller's
        frame is modified; the same object is also returned.

        Args:
            data: Frame to clean and persist.

        Returns:
            pd.DataFrame: ``data`` itself, without the ``_id`` column.

        Raises:
            UsVisaException: if dropping the column or writing the file fails.
        """
        try:
            # errors="ignore" keeps the call idempotent: a frame that was
            # already cleaned (e.g. the cell was re-run) or that came from an
            # empty collection has no "_id" column to drop.
            data.drop(columns=["_id"], inplace=True, errors="ignore")
            data.to_csv(self.config.local_data_file, index=False)
            logger.info(f"Data saved successfully at {self.config.local_data_file}")
            logger.info(f"Data shape: {data.shape}")
        except Exception as e:
            raise UsVisaException(e) from e
        return data

    def split_data(
        self,
        data: pd.DataFrame,
        train_frac: float = 0.8,
        random_state: int = 42,
    ) -> None:
        """Write a random train/test split of ``data`` to ``config.root_dir``.

        Args:
            data: Frame to split.
            train_frac: Fraction of rows placed in the training file.
            random_state: Seed for the row sampling, for reproducible splits.

        Raises:
            UsVisaException: if sampling or writing either file fails.
        """
        try:
            # Sample row *positions* rather than index labels. Dropping by
            # label would also remove unsampled rows that share a label with a
            # sampled one when the index contains duplicates. For a unique
            # index this selects the same rows, in the same order, as
            # data.sample(...) followed by data.drop(train_df.index).
            positions = pd.Series(range(len(data)))
            train_pos = positions.sample(frac=train_frac, random_state=random_state)
            test_pos = positions.drop(train_pos.index)
            train_df = data.iloc[train_pos.to_numpy()]
            test_df = data.iloc[test_pos.to_numpy()]

            # save train and test data to local file
            train_df.to_csv(os.path.join(self.config.root_dir, self.config.train_file_name), index=False)
            test_df.to_csv(os.path.join(self.config.root_dir, self.config.test_file_name), index=False)
            logger.info(f"Train and test data saved successfully at {self.config.root_dir}")
            logger.info(f"Train data shape: {train_df.shape}")
            logger.info(f"Test data shape: {test_df.shape}")
        except Exception as e:
            raise UsVisaException(e) from e

        

In [11]:
from us_visa import logger

In [14]:
# Run the ingestion stage end to end: build the config, pull the collection,
# persist the cleaned dataset, then write the train/test split.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data = data_ingestion.load_data()
    # Use the frame save_data returns instead of relying on its in-place
    # removal of "_id", so the split visibly receives the cleaned data.
    data = data_ingestion.save_data(data=data)
    data_ingestion.split_data(data=data)
    logger.info("Data ingestion completed successfully.")
except Exception as e:
    logger.error(f"Error during data ingestion: {e}")
    # Bare raise re-raises the active exception with its traceback unchanged.
    raise

[2026-02-10 13:20:39,116: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-10 13:20:39,121: INFO: common: created directory at: artifacts]
[2026-02-10 13:20:39,121: INFO: common: created directory at: artifacts/data_ingestion]
[2026-02-10 13:22:19,307: INFO: 2290529942: Data saved successfully at artifacts/data_ingestion/EasyVisa.csv]
[2026-02-10 13:22:19,316: INFO: 2290529942: Data shape: (25480, 12)]
[2026-02-10 13:22:19,903: INFO: 2290529942: Train and test data saved successfully at artifacts/data_ingestion]
[2026-02-10 13:22:19,907: INFO: 2290529942: Train data shape: (20384, 12)]
[2026-02-10 13:22:19,914: INFO: 2290529942: Test data shape: (5096, 12)]
[2026-02-10 13:22:19,927: INFO: 1519029422: Data ingestion completed successfully.]
