In [None]:
import os
from dotenv import load_dotenv
from dataclasses import dataclass
from pathlib import Path
import json
from typing import List
import tensorflow as tf

In [2]:
# Show the kernel's current working directory; the relative project paths used in later cells are resolved against it.
os.getcwd()

'd:\\Projects\\E2E-ChestCancer-MlFlow-DVC\\research'

In [3]:
# Move from the notebook folder ("research") up to the project root so that the
# relative config/artifact paths used in later cells resolve correctly.
# Guarded on the folder name so that re-running this cell does not keep climbing
# one directory higher on every execution.
if Path.cwd().name == "research":
    os.chdir("../")

## DATA INGESTION

In [4]:
# Load environment variables through python-dotenv (by default from a .env file).
load_dotenv()

# Kaggle credentials as found in the environment; each is None when the variable is unset.
kaggle_username = os.getenv('KAGGLE_USERNAME')
kaggle_key = os.getenv('KAGGLE_KEY')

# NOTE(review): keep this import after load_dotenv() — the kaggle package appears to
# look for credentials as soon as it is imported; confirm before moving it to the
# top-level import cell.
from kaggle.api.kaggle_api_extended import KaggleApi

# Earlier one-off download, left disabled for reference. The DataIngestion class
# defined later in this notebook performs the download using values from the config file.
# dataset_url = os.getenv('DATASET_URL')
# dataset_path = os.getenv('DATASET_PATH')

# api = KaggleApi()
# api.authenticate()

# api.dataset_download_files(dataset_url, path=dataset_path, unzip=True)

In [5]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage."""

    # Root directory of the ingestion stage; created by ConfigurationManager.get_data_ingestion_config.
    root_dir: Path
    # Dataset identifier handed to KaggleApi.dataset_download_files.
    source_URL: str
    # Directory the dataset is downloaded and unzipped into; also holds dataset_metadata.json.
    unzip_dir: Path

In [6]:
from e2e_cnnClassifier_ChestCancer.constants import *
from e2e_cnnClassifier_ChestCancer.utils.utils import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the config and params YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH
    ):
        """
        Initializes the ConfigurationManager by loading configuration and parameter files.

        Args:
            config_filepath (str): Path to the config YAML file.
            params_filepath (str): Path to the params YAML file.
        """
        self.config = self._read_yaml(config_filepath)
        self.params = self._read_yaml(params_filepath)

        # NOTE(review): "artificats_root" (sic) presumably mirrors the key spelling in the
        # config file; fix the typo in both places together, never only here.
        self._create_directories([self.config.artificats_root])

    def _read_yaml(self, filepath: str):
        """Reads a YAML file and returns its contents (delegates to utils.read_yaml)."""
        return read_yaml(filepath)

    def _create_directories(self, dirs: list):
        """Creates the given directories (delegates to utils.create_directories)."""
        create_directories(dirs)

    def _print_config_and_params(self):
        """Prints the loaded config and params contents; a debugging aid."""
        print("Configuration Contents:")
        print(self.config)

        print("\nParameters Contents:")
        print(self.params)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Prepares and returns the DataIngestionConfig object with necessary configurations.

        Side effect: creates the data-ingestion root directory.

        Returns:
            DataIngestionConfig: Configuration object for data ingestion.
        """
        config = self.config.data_ingestion

        # DataIngestionConfig declares its directory fields as Path, so convert the raw
        # config values here instead of passing them through untouched.
        root_dir = Path(config.root_dir)
        unzip_dir = Path(config.unzip_dir)

        # Ensure data ingestion root directory exists
        self._create_directories([root_dir])

        return DataIngestionConfig(
            root_dir=root_dir,
            source_URL=config.source_URL,
            unzip_dir=unzip_dir
        )


In [8]:
class DataIngestion:
    """Downloads a Kaggle dataset and skips the download while the local copy looks intact.

    After a download, a small JSON file (dataset_metadata.json inside ``unzip_dir``)
    records the dataset identifier and the number of files in each top-level folder.
    A later run downloads again only if that record is missing or unreadable, refers
    to a different dataset, or no longer matches what is on disk.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Stores the config and authenticates a Kaggle API client.

        Args:
            config: Object exposing ``source_URL`` and ``unzip_dir``.
        """
        self.config = config
        self.api = KaggleApi()
        self.api.authenticate()
        self.metadata_path = os.path.join(self.config.unzip_dir, "dataset_metadata.json")

    @staticmethod
    def _count_files(folder_path) -> int:
        """Returns the number of regular files below ``folder_path`` (recursive).

        Counts real files only: files without an extension are included and
        directories whose name contains a dot are not, unlike a "*.*" glob.
        """
        return sum(1 for entry in Path(folder_path).rglob("*") if entry.is_file())

    def _record_metadata(self, folder_paths):
        """
        Records metadata about downloaded folders and files.

        Args:
            folder_paths: Iterable of folder paths (str) whose file counts are stored.
        """
        metadata = {
            "folders": {folder_path: self._count_files(folder_path) for folder_path in folder_paths},
            "source_URL": self.config.source_URL
        }

        # The metadata file lives inside unzip_dir; make sure that directory exists.
        os.makedirs(os.path.dirname(self.metadata_path) or ".", exist_ok=True)
        with open(self.metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f)

    def _load_metadata(self):
        """
        Loads existing metadata from the metadata file.

        Returns:
            dict | None: The stored metadata, or None when the file is missing,
            is not valid JSON, or does not contain a JSON object.
        """
        if not os.path.exists(self.metadata_path):
            return None

        try:
            with open(self.metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
        except ValueError:
            # Truncated/corrupt file (JSONDecodeError and UnicodeDecodeError are both
            # ValueErrors): treat it as "no record" so a fresh download repairs it.
            return None

        return metadata if isinstance(metadata, dict) else None

    def _check_if_download_needed(self):
        """
        Checks if the dataset needs to be downloaded based on the metadata.

        Returns:
            bool: True when a download is required, False when the local copy
            matches the recorded metadata.
        """
        metadata = self._load_metadata()

        if not metadata or metadata.get("source_URL") != self.config.source_URL:
            return True

        folders = metadata.get("folders")
        if not isinstance(folders, dict):
            return True

        for folder_path, file_count in folders.items():
            if not os.path.isdir(folder_path):
                return True
            if self._count_files(folder_path) != file_count:
                return True

        return False

    def download_dataset(self):
        """
        Downloads the dataset only if it hasn't been downloaded or has been updated.
        """
        if not self._check_if_download_needed():
            print("Dataset already up-to-date, no download necessary.")
            return

        print("Downloading dataset...")
        self.api.dataset_download_files(self.config.source_URL, path=self.config.unzip_dir, unzip=True)

        # Only the top-level folders of the unzipped dataset are tracked.
        folder_paths = sorted(
            str(folder) for folder in Path(self.config.unzip_dir).glob("*") if folder.is_dir()
        )
        self._record_metadata(folder_paths)
        print("Dataset downloaded and metadata recorded.")


In [9]:
# Run the data-ingestion stage end to end. No try/except wrapper: catching an
# exception only to re-raise it adds nothing, and errors surface with their
# original traceback this way.
config_manager = ConfigurationManager()
data_ingestion_config = config_manager.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_dataset()

[2024-10-29 18:19:36,213: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2024-10-29 18:19:36,216: INFO: utils: yaml file: params.yaml loaded successfully]
[2024-10-29 18:19:36,217: INFO: utils: created directory at: artifacts]
[2024-10-29 18:19:36,218: INFO: utils: created directory at: artifacts/data_ingestion]
Dataset already up-to-date, no download necessary.


## Base Model: Transfer Learning

In [35]:
@dataclass
class BaseModelConfigAndParams:
    """Settings for the base-model stage, merged from the config and params YAML files.

    Fields prefixed ``base_model_`` come from the config file; the ``model_``,
    ``data_`` and ``training_`` fields come from the params file. Path-like fields
    are annotated as Path because ConfigurationManager.get_base_model_config wraps
    them in Path before constructing this object.
    """

    # Locations on disk (from config['base_model']).
    base_model_root_dir: Path
    base_model_path: Path
    updated_model_path: Path

    # Model definition (from params['model']).
    model_name: str
    model_input_shape: List[int]
    model_num_classes: int

    # Input data settings (from params['data']).
    data_train_data_dir: Path
    data_batch_size: int
    data_image_size: List[int]

    # Training settings (from params['training']).
    training_weights: str
    training_include_top: bool
    training_epochs: int
    training_learning_rate: float
    training_optimizer: str

In [34]:
class ConfigurationManager:
    """Loads the config and params YAML files and builds per-stage configuration objects.

    This definition replaces the ConfigurationManager declared earlier in the notebook,
    so it also provides get_data_ingestion_config: the data-ingestion cells keep
    working when they are re-run after this cell has been executed.
    """

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH
    ):
        """
        Initializes the ConfigurationManager by loading configuration and parameter files.

        Args:
            config_filepath (str): Path to the config YAML file.
            params_filepath (str): Path to the params YAML file.
        """
        self.config = self._read_yaml(config_filepath)
        self.params = self._read_yaml(params_filepath)
        # No directories are created here; each get_*_config method creates the
        # directory its own stage needs.

    def _read_yaml(self, filepath: str):
        """Reads a YAML file and returns its contents (delegates to utils.read_yaml)."""
        return read_yaml(filepath)

    def _create_directories(self, dirs: list):
        """Creates the given directories (delegates to utils.create_directories)."""
        create_directories(dirs)

    def _print_config_and_params(self):
        """Prints the loaded config and params contents; a debugging aid."""
        print("Configuration Contents:")
        print(self.config)

        print("\nParameters Contents:")
        print(self.params)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Prepares and returns the DataIngestionConfig object with necessary configurations.

        Side effect: creates the data-ingestion root directory.

        Returns:
            DataIngestionConfig: Configuration object for data ingestion.
        """
        ingestion_cfg = self.config['data_ingestion']
        root_dir = Path(ingestion_cfg['root_dir'])

        self._create_directories([root_dir])

        return DataIngestionConfig(
            root_dir=root_dir,
            source_URL=ingestion_cfg['source_URL'],
            unzip_dir=Path(ingestion_cfg['unzip_dir'])
        )

    def get_base_model_config(self) -> BaseModelConfigAndParams:
        """Creates and returns an instance of BaseModelConfigAndParams from the loaded configurations and parameters.

        Side effect: creates the base-model root directory.

        Returns:
            BaseModelConfigAndParams: Paths from the config file combined with the
            model, data and training values from the params file.
        """
        base_model_cfg = self.config['base_model']
        model_params = self.params['model']
        data_params = self.params['data']
        training_params = self.params['training']

        root_dir = Path(base_model_cfg['root_dir'])
        self._create_directories([root_dir])

        return BaseModelConfigAndParams(
            base_model_root_dir=root_dir,
            base_model_path=Path(base_model_cfg['base_model_path']),
            # The config key is 'updated_model' while the dataclass field is 'updated_model_path'.
            updated_model_path=Path(base_model_cfg['updated_model']),

            model_name=model_params['name'],
            model_input_shape=model_params['input_shape'],
            model_num_classes=model_params['num_classes'],

            data_train_data_dir=Path(data_params['train_data_dir']),
            data_batch_size=data_params['batch_size'],
            data_image_size=data_params['image_size'],

            training_weights=training_params['weights'],
            training_include_top=training_params['include_top'],
            training_epochs=training_params['epochs'],
            training_learning_rate=training_params['learning_rate'],
            training_optimizer=training_params['optimizer'],
        )

In [None]:
class BaseModel:
    """Builds the transfer-learning model: a pretrained backbone plus a new classifier head.

    Typical use: call download_and_save_base_model() first, then update_base_model().
    """

    def __init__(self, config: "BaseModelConfigAndParams"):
        """Stores the stage configuration; no model is built yet.

        Args:
            config: Paths and hyper-parameters for the base-model stage.
        """
        self.config = config
        self.model = None  # Backbone, set by download_and_save_base_model()
        self.full_model = None  # Backbone + classifier head, set by update_base_model()

    def download_and_save_base_model(self):
        """Downloads the base model and saves it to the specified path."""
        # NOTE(review): the architecture (EfficientNetB0) and include_top=False are
        # hard-coded; config.model_name and config.training_include_top are not read
        # here. Confirm whether they are meant to drive this call.
        self.model = tf.keras.applications.EfficientNetB0(
            weights=self.config.training_weights,
            include_top=False,
            # The shape arrives as a list-like value from the params file; pass a plain tuple.
            input_shape=tuple(self.config.model_input_shape)
        )

        self.save_model(path=Path(self.config.base_model_path), model=self.model)

    @staticmethod
    def _prepare_full_model(model: "tf.keras.Model", classes: int, freeze_all: bool, freeze_till: int, learning_rate: float) -> "tf.keras.Model":
        """Prepares the full model by adding a custom classifier on top of the base model.

        Args:
            model: Backbone whose output feeds the new classifier head.
            classes: Number of units in the softmax output layer.
            freeze_all: When True, every backbone layer is made non-trainable.
            freeze_till: Used only when freeze_all is False; if positive, all layers
                except the last ``freeze_till`` are made non-trainable.
            learning_rate: Learning rate for the Adam optimizer.

        Returns:
            The compiled model (Flatten -> Dense softmax on top of ``model``).
        """
        if freeze_all:
            frozen_layers = model.layers
        elif (freeze_till is not None) and (freeze_till > 0):
            frozen_layers = model.layers[:-freeze_till]
        else:
            frozen_layers = []

        for layer in frozen_layers:
            layer.trainable = False

        flatten_in = tf.keras.layers.Flatten()(model.output)
        prediction = tf.keras.layers.Dense(
            units=classes,
            activation="softmax"
        )(flatten_in)

        full_model = tf.keras.models.Model(
            inputs=model.input,
            outputs=prediction
        )

        # NOTE(review): the optimizer is always Adam; config.training_optimizer is not
        # consulted. Confirm whether that is intended.
        full_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=["accuracy"]
        )

        return full_model

    def update_base_model(self):
        """Updates the base model by preparing the full model and saving it.

        Raises:
            RuntimeError: If no base model has been loaded yet.
        """
        if self.model is None:
            # Fail with an actionable message instead of an AttributeError on None.
            raise RuntimeError(
                "Base model is not loaded; call download_and_save_base_model() "
                "before update_base_model()."
            )

        self.full_model = self._prepare_full_model(
            model=self.model,
            classes=self.config.model_num_classes,
            freeze_all=True,
            freeze_till=None,
            learning_rate=self.config.training_learning_rate
        )

        self.save_model(path=Path(self.config.updated_model_path), model=self.full_model)

    @staticmethod
    def save_model(path: Path, model: "tf.keras.Model"):
        """Saves the model to the specified path, creating the parent directory if needed."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        model.save(path)


In [37]:
# Run the base-model stage end to end. No try/except wrapper: catching an
# exception only to re-raise it adds nothing, and errors surface with their
# original traceback this way.
config = ConfigurationManager()
prepare_base_model_config = config.get_base_model_config()
prepare_base_model = BaseModel(config=prepare_base_model_config)
prepare_base_model.download_and_save_base_model()
prepare_base_model.update_base_model()

[2024-10-29 19:30:54,655: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2024-10-29 19:30:54,661: INFO: utils: yaml file: params.yaml loaded successfully]
[2024-10-29 19:30:54,662: INFO: utils: created directory at: artifacts\base_model]
