In [None]:
import os
# Step up one directory so that relative paths such as "config/config.yaml"
# (read further down) resolve. The guard keeps this cell idempotent: once the
# working directory already contains that file, re-running the cell no longer
# climbs another level.
# NOTE(review): assumes the notebook lives one level below the folder holding "config/" - confirm.
if not os.path.isfile(os.path.join("config", "config.yaml")):
    os.chdir("..")
os.getcwd()

In [None]:
from package.utils import read_yaml
from dataclasses import dataclass


# Project configuration loaded once for the whole notebook. The constants below
# read it through attribute access (e.g. CONFIG.DATA.INGESTION.ROOT_DIR).
# The path is relative, so the working directory must contain "config/".
CONFIG = read_yaml("config/config.yaml")

@dataclass
class DataIngestionConstants:
    """Directory names, file names and fixed settings for the data-ingestion stage.

    The directory and file names are read from ``CONFIG``; the split ratio and the
    database/collection names are hard-coded in this class.
    """
    # NOTE(review): none of the attributes below is annotated, so @dataclass does not
    # register them as fields; they are ordinary class attributes read via the class.

    # directory names, outermost first
    ARITFACTS_ROOT_DIR_NAME = CONFIG.ARITFACTS_ROOT_DIR_NAME
    DATA_ROOT_DIR_NAME = CONFIG.DATA.ROOT_DIR_NAME
    INGESTION_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.ROOT_DIR

    # feature store: directory name and the raw file kept in it
    FEATURE_STORE_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.FEATURE_STORE.ROOT_DIR
    RAW_FILE_NAME = CONFIG.DATA.INGESTION.FEATURE_STORE.RAW_FILE_NAME

    # ingested data: directory name and the train/test file names
    INGESTED_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.INGESTED.ROOT_DIR
    TRAIN_FILE_NAME = CONFIG.DATA.INGESTION.INGESTED.TRAIN_FILE_NAME
    TEST_FILE_NAME = CONFIG.DATA.INGESTION.INGESTED.TEST_FILE_NAME

    # hard-coded settings (not read from CONFIG)
    SPLIT_RATIO = 0.2  # ends up as `test_size` in train_test_split
    DATABASE_NAME = "Network-Security"  # MongoDB database name
    COLLECTION_NAME = "Data"  # MongoDB collection name



In [None]:
# Banner, then one "<NAME>:  <value>" line per constant, in the order listed.
print("""
DataIngestionConstants
----------------------
      """)
print(
    "\n".join(
        # two spaces after the colon reproduce print(label_with_trailing_space, value)
        f"{attr_name}:  {getattr(DataIngestionConstants, attr_name)!s}"
        for attr_name in (
            "ARITFACTS_ROOT_DIR_NAME",
            "DATA_ROOT_DIR_NAME",
            "INGESTION_ROOT_DIR_NAME",
            "FEATURE_STORE_ROOT_DIR_NAME",
            "INGESTED_ROOT_DIR_NAME",
            "RAW_FILE_NAME",
            "TRAIN_FILE_NAME",
            "TEST_FILE_NAME",
            "SPLIT_RATIO",
            "DATABASE_NAME",
            "COLLECTION_NAME",
        )
    )
)

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataIngestionConfigEntity:
    """Typed record of every location and setting the data-ingestion stage needs.

    All fields are required; their declaration order is the positional order of
    the generated ``__init__``.
    """

    # directory chain, outermost first
    ARITFACTS_ROOT_DIR_PATH: Path
    DATA_ROOT_DIR_PATH: Path
    INGESTION_ROOT_DIR_PATH: Path

    # feature store directory and the raw file inside it
    FEATURE_STORE_ROOT_DIR_PATH: Path
    RAW_FILE_PATH: Path

    # ingested directory and the train/test files inside it
    INGESTED_ROOT_DIR_PATH: Path
    TRAIN_FILE_PATH: Path
    TEST_FILE_PATH: Path

    # split and data-source settings
    SPLIT_RATIO: float
    DATABASE_NAME: str
    COLLECTION_NAME: str




In [None]:
from dataclasses import dataclass
import os


@dataclass
class DataIngestionConfig:
    """Locations and settings for data ingestion, derived from DataIngestionConstants.

    Each path is produced with ``os.path.join`` on top of the previous level:
    artifacts root -> data -> ingestion -> {feature store, ingested} -> file.
    """

    # --- directory chain ---
    ARITFACTS_ROOT_DIR_PATH = DataIngestionConstants.ARITFACTS_ROOT_DIR_NAME
    DATA_ROOT_DIR_PATH = os.path.join(
        ARITFACTS_ROOT_DIR_PATH,
        DataIngestionConstants.DATA_ROOT_DIR_NAME,
    )
    INGESTION_ROOT_DIR_PATH = os.path.join(
        DATA_ROOT_DIR_PATH,
        DataIngestionConstants.INGESTION_ROOT_DIR_NAME,
    )

    # --- feature store: raw dump of the collection ---
    FEATURE_STORE_ROOT_DIR_PATH = os.path.join(
        INGESTION_ROOT_DIR_PATH,
        DataIngestionConstants.FEATURE_STORE_ROOT_DIR_NAME,
    )
    RAW_FILE_PATH = os.path.join(
        FEATURE_STORE_ROOT_DIR_PATH,
        DataIngestionConstants.RAW_FILE_NAME,
    )

    # --- ingested: train/test files ---
    INGESTED_ROOT_DIR_PATH = os.path.join(
        INGESTION_ROOT_DIR_PATH,
        DataIngestionConstants.INGESTED_ROOT_DIR_NAME,
    )
    TRAIN_FILE_PATH = os.path.join(
        INGESTED_ROOT_DIR_PATH,
        DataIngestionConstants.TRAIN_FILE_NAME,
    )
    TEST_FILE_PATH = os.path.join(
        INGESTED_ROOT_DIR_PATH,
        DataIngestionConstants.TEST_FILE_NAME,
    )

    # --- settings copied through unchanged ---
    SPLIT_RATIO = DataIngestionConstants.SPLIT_RATIO
    DATABASE_NAME = DataIngestionConstants.DATABASE_NAME
    COLLECTION_NAME = DataIngestionConstants.COLLECTION_NAME




In [None]:
# Banner, then one "<NAME>:  <value>" line per config attribute, in the order listed.
print("""
DataIngestionConfig
-------------------
      """)
print(
    "\n".join(
        # two spaces after the colon reproduce print(label_with_trailing_space, value)
        f"{attr_name}:  {getattr(DataIngestionConfig, attr_name)!s}"
        for attr_name in (
            "ARITFACTS_ROOT_DIR_PATH",
            "DATA_ROOT_DIR_PATH",
            "INGESTION_ROOT_DIR_PATH",
            "FEATURE_STORE_ROOT_DIR_PATH",
            "RAW_FILE_PATH",
            "INGESTED_ROOT_DIR_PATH",
            "TRAIN_FILE_PATH",
            "TEST_FILE_PATH",
            "SPLIT_RATIO",
            "DATABASE_NAME",
            "COLLECTION_NAME",
        )
    )
)

In [None]:
from package.exception import CustomException
from package.utils import create_dirs
from dataclasses import dataclass
from dotenv import load_dotenv
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
import pandas as pd
import sys
import os


@dataclass
class DataIngestionComponents:
    """Downloads a MongoDB collection to a CSV file and splits it into train/test CSVs.

    Attributes:
        data_ingestion_config: object exposing the ``*_PATH``, ``SPLIT_RATIO``,
            ``DATABASE_NAME`` and ``COLLECTION_NAME`` attributes read by the methods below.
    """
    # Quoted forward reference: the annotation is not evaluated when the class is created.
    data_ingestion_config: "DataIngestionConfigEntity"

    @staticmethod
    def collection_to_dataframe(collection) -> pd.DataFrame:
        """Convert a MongoDB collection into a pandas DataFrame.

        Args:
            collection: object whose ``find()`` returns an iterable of documents
                (e.g. a pymongo collection).

        Returns:
            pandas.DataFrame: one row per document, without the ``_id`` column.

        Raises:
            CustomException: wraps any error raised while reading or converting.
        """
        try:
            # fetching every document of the collection, one row per document
            df = pd.DataFrame(list(collection.find()))

            # dropping the "_id" column; errors="ignore" because an empty
            # collection produces a frame that has no "_id" column at all
            return df.drop(columns=["_id"], errors="ignore")
        except Exception as e:
            raise CustomException(e, sys)

    def collect_data(self) -> None:
        """Fetch the configured collection and save it to ``RAW_FILE_PATH``.

        Creates the artifact directories, connects to MongoDB using the ``URI``
        environment variable (loaded from a ``.env`` file when present), stores the
        collection on ``self.data_frame`` and writes it as CSV.

        Raises:
            CustomException: wraps any failure, including a missing ``URI``.
        """
        try:
            config = self.data_ingestion_config

            # creating required directories
            for directory in (
                config.ARITFACTS_ROOT_DIR_PATH,
                config.DATA_ROOT_DIR_PATH,
                config.INGESTION_ROOT_DIR_PATH,
                config.FEATURE_STORE_ROOT_DIR_PATH,
                config.INGESTED_ROOT_DIR_PATH,
            ):
                create_dirs(directory)

            # loading sensitive variables from the environment
            load_dotenv()
            uri = os.getenv("URI")
            if not uri:
                # MongoClient(None) would fall back to its default host instead of failing
                raise ValueError("Environment variable 'URI' is not set; cannot connect to MongoDB")

            # connecting to mongodb; the context manager closes the client afterwards
            with MongoClient(uri) as client:
                collection = client[config.DATABASE_NAME][config.COLLECTION_NAME]

                # converting mongodb collection into pandas dataframe
                self.data_frame = self.collection_to_dataframe(collection)

            # saving data into local file path
            self.data_frame.to_csv(config.RAW_FILE_PATH, index=False, header=True)

        except Exception as e:
            raise CustomException(e, sys)

    def get_splits(self) -> None:
        """Split the collected data into train and test sets and save both as CSV.

        Uses ``self.data_frame`` when ``collect_data`` has already run on this
        instance; otherwise loads the raw CSV from ``RAW_FILE_PATH``.

        Raises:
            CustomException: wraps any failure while loading, splitting or saving.
        """
        try:
            config = self.data_ingestion_config

            data_frame = getattr(self, "data_frame", None)
            if data_frame is None:
                # collect_data() has not run on this instance: reuse the raw file it writes
                data_frame = pd.read_csv(config.RAW_FILE_PATH)
                self.data_frame = data_frame

            # getting train and test data according to split ratio
            train_data, test_data = train_test_split(
                data_frame, test_size=config.SPLIT_RATIO, random_state=42
            )

            # saving train data into local file path
            train_data.to_csv(config.TRAIN_FILE_PATH, index=False, header=True)

            # saving test data into local file path
            test_data.to_csv(config.TEST_FILE_PATH, index=False, header=True)

        except Exception as e:
            raise CustomException(e, sys)

In [None]:
# Run the ingestion stage end to end.
# NOTE(review): the DataIngestionConfig class itself is passed here (not an instance,
# and not a DataIngestionConfigEntity); the component only reads attributes from it.
obj = DataIngestionComponents(data_ingestion_config=DataIngestionConfig)
# collect_data() fetches the collection and stores the frame that get_splits() then splits.
obj.collect_data()
obj.get_splits()