In [1]:
import os

In [2]:
%pwd

'/Users/liangzhang/Documents/ai/dl-tutorial/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/liangzhang/Documents/ai/dl-tutorial'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
	root_dir: Path
	source_URL: str
	local_data_file: Path
	unzip_dir: Path

In [6]:
from template_dl_tutorial.constants import *
from template_dl_tutorial.utils.common import create_directories, read_yaml

In [7]:
class ConfigurationManager:
	def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
		self.config = read_yaml(config_filepath)
		self.params = read_yaml(params_filepath)

		create_directories([self.config.artifacts_root])

	def get_data_ingestion_config(self) -> DataIngestionConfig:
		config = self.config.data_ingestion
		create_directories([config.root_dir])
		return DataIngestionConfig(
			root_dir = config.root_dir,
			source_URL = config.source_URL,
			local_data_file = config.local_data_file,
			unzip_dir = config.unzip_dir
		)

In [8]:
import os
import zipfile
import gdown
from template_dl_tutorial import logger
from template_dl_tutorial.utils.common import get_size

In [9]:
class DataIngestion:
	def __init__(self, config: DataIngestionConfig) -> None:
		self.config = config
	def download_file(self) -> str:
		'''
		Fetch data from the url
		'''

		try:
			dataset_url=self.config.source_URL
			zip_download_dir=self.config.local_data_file
			os.makedirs("artifacts/data_ingestion", exist_ok=True)
			logger.info(f"Downloading data from {dataset_url} to {zip_download_dir}")
			file_id = dataset_url.split("/")[-2]
			prefix = "https://drive.google.com/uc?/export=download&id="
			gdown.download(prefix+file_id, zip_download_dir)
			logger.info(f"Data downloaded at {zip_download_dir}")

		except Exception as e:
			raise e
		
	def extract_zip_file(self):
		"""
		zip_file_path: str
		Extracts the zip file into the data director
		Function returns None
		"""
		unzip_path = self.config.unzip_dir
		os.makedirs(unzip_path, exist_ok=True)
		with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
			zip_ref.extractall(unzip_path)


In [11]:
try:
	config = ConfigurationManager()
	data_ingestion_config = config.get_data_ingestion_config()
	data_ingestion = DataIngestion(data_ingestion_config)
	data_ingestion.download_file()
	data_ingestion.extract_zip_file()
except Exception as e:
	raise e

[2024-05-29 16:31:38,824]: INFO: common: yaml file: config/config.yaml loaded successfully
[2024-05-29 16:31:38,825]: INFO: common: yaml file: params.yaml loaded successfully
[2024-05-29 16:31:38,826]: INFO: common: Created directory at artifacts
[2024-05-29 16:31:38,826]: INFO: common: Created directory at artifacts/data_ingestion
[2024-05-29 16:31:38,827]: INFO: 4277454294: Downloading data from https://drive.google.com/file/d/1VTpzsz_XDA55nnHc8JPvSxamBi5Np9c2/view?usp=sharing to artifacts/data_ingestion/data.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1VTpzsz_XDA55nnHc8JPvSxamBi5Np9c2
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1VTpzsz_XDA55nnHc8JPvSxamBi5Np9c2&confirm=t&uuid=fc9ebb75-fa13-442f-b5c2-c7277a117313
To: /Users/liangzhang/Documents/ai/dl-tutorial/artifacts/data_ingestion/data.zip
100%|██████████| 57.7M/57.7M [00:08<00:00, 6.83MB/s]

[2024-05-29 16:31:48,968]: INFO: 4277454294: Data downloaded at artifacts/data_ingestion/data.zip



