In [1]:
import os

In [2]:
%pwd

'/Users/karthiksridhar/Documents/End2End-ML-Project/research'

In [3]:
# Step up from the notebook folder to the project root so that the relative
# paths used below (config/config.yaml, artifacts/...) resolve.
# Guarded on the folder name so that re-running this cell does not keep
# climbing one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'/Users/karthiksridhar/Documents/End2End-ML-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings consumed by the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Directory created (if missing) before the download starts.
    root_dir: Path
    # Address the archive is fetched from.
    source_URL: str
    # Where the downloaded archive is written, and later read for extraction.
    local_data_file: Path
    # Directory the archive's contents are extracted into.
    unzip_dir: Path


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    On construction the config, params and schema files are read with
    ``read_yaml`` and the directory named by ``config.artifacts_root`` is
    passed to ``create_directories``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Returns:
            A ``DataIngestionConfig`` whose path fields are real ``Path``
            objects, matching the dataclass annotations, rather than the raw
            values loaded from YAML.
        """
        section = self.config.data_ingestion

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )
            

In [22]:
import os
import requests
import zipfile
from mlProject import logger
from mlProject.utils.common import get_size

In [25]:
class DataIngestion:
    """Fetches the configured archive and unpacks it on the local disk."""

    def __init__(self, config: "DataIngestionConfig"):
        """Keep a reference to the ingestion settings.

        Args:
            config: Object exposing ``root_dir``, ``source_URL``,
                ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    def download_file(self, timeout=30, chunk_size=8192):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The body is streamed into a sibling ``<local_data_file>.part`` file and
        only renamed onto the final path once the transfer has finished, so an
        interrupted or failed download never leaves a truncated file that a
        later run would mistake for a complete one.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            chunk_size: Number of bytes requested per streamed chunk.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.RequestException: The request failed or timed out.
            OSError: The file could not be written.
        """
        # Ensure the ingestion directory exists before writing into it.
        os.makedirs(self.config.root_dir, exist_ok=True)

        data_zip_path = self.config.local_data_file

        # Nothing to do when a previous run already fetched the archive.
        if os.path.exists(data_zip_path):
            logger.info(f"{data_zip_path} already exists, with size: {get_size(Path(data_zip_path))}")
            return

        partial_path = f"{os.fspath(data_zip_path)}.part"
        try:
            # The context manager releases the connection even on failure.
            with requests.get(url=self.config.source_URL, stream=True, timeout=timeout) as response:
                # Refuse to store an HTTP error page as if it were the archive.
                response.raise_for_status()
                with open(partial_path, 'wb') as fd:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            fd.write(chunk)
            os.replace(partial_path, data_zip_path)
        except BaseException:
            # Also covers KeyboardInterrupt (an interrupted notebook cell).
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

        logger.info(f"File downloaded and saved as {data_zip_path}!")

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The target directory is created when missing.

        Raises:
            zipfile.BadZipFile: ``local_data_file`` is not a valid zip archive.
            FileNotFoundError: ``local_data_file`` does not exist.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)



In [26]:
# Run the ingestion stage end to end: load settings, fetch the archive, unpack it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception:
    # Record the failure with its traceback in the project log, then let the
    # original exception propagate untouched (bare `raise` keeps the traceback).
    # NOTE(review): assumes `logger` is a standard logging.Logger — confirm.
    logger.exception("Data ingestion stage failed")
    raise

[2024-10-17 05:56:53,317: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-10-17 05:56:53,320: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-17 05:56:53,321: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-17 05:56:53,322: INFO: common: created directory at: artifacts]
[2024-10-17 05:56:54,154: INFO: 2334480070: File downloaded and saved as artifacts/data_ingestion/data.zip!]


In [21]:
%pwd

'/Users/karthiksridhar/Documents/End2End-ML-Project'