In [1]:
import os

In [2]:
%pwd

'd:\\Desktop\\NLP\\Lab 1\\NLP-Tokenization-and-Language-Modeling\\research'

In [3]:
# Move the working directory one level up, out of the notebook's "research"
# folder and into its parent (compare the %pwd outputs before and after).
os.chdir("../")

In [4]:
%pwd

'd:\\Desktop\\NLP\\Lab 1\\NLP-Tokenization-and-Language-Modeling'

In [None]:
import zipfile
import os
import logging
from pathlib import Path
from dataclasses import dataclass  # Import dataclass decorator

# Configure the root logger to emit INFO-level (and higher) records, then
# create a module-level logger named after this module for the code below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the filesystem locations used for data ingestion.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Root directory supplied by the caller.
    root_dir: Path
    # Zip archive that gets opened and extracted.
    local_data_file: Path
    # Directory that receives the extracted archive contents.
    unzip_dir: Path

class DataIngestion:
    """Extract the dataset archive described by a ``DataIngestionConfig``.

    The configuration supplies the archive location (``local_data_file``)
    and the extraction target (``unzip_dir``). Path values may be
    ``pathlib.Path`` objects or plain strings; they are normalised to
    ``Path`` before use.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store *config* for later use by :meth:`extract_zip_file`."""
        self.config = config

    def get_size(self, directory: Path):
        """Return the combined size of all files under *directory*.

        Args:
            directory: Folder to measure, as a ``Path`` or a string path.
                The folder is searched recursively.

        Returns:
            The total size formatted by :meth:`format_size`,
            for example ``"1.00 KB"``.
        """
        # Coerce first so that a string path works as well as a Path.
        base = Path(directory)
        total_bytes = sum(
            entry.stat().st_size for entry in base.rglob('*') if entry.is_file()
        )
        return self.format_size(total_bytes)

    def format_size(self, size_in_bytes):
        """Render a byte count with two decimals and a binary unit.

        Args:
            size_in_bytes: Number of bytes (anything ``float()`` accepts).

        Returns:
            A string such as ``"512.00 B"`` or ``"1.50 KB"``. Units step by
            a factor of 1024 from B up to GB; anything larger is reported
            in TB.
        """
        size_in_bytes = float(size_in_bytes)
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_in_bytes < 1024.0:
                return f"{size_in_bytes:.2f} {unit}"
            size_in_bytes /= 1024.0
        # Reached only when the value is at least 1024 GB.
        return f"{size_in_bytes:.2f} TB"

    def extract_zip_file(self):
        """Extract the configured archive and log the extracted size.

        Creates ``config.unzip_dir`` if needed, extracts every member of
        ``config.local_data_file`` into it, deletes a top-level
        ``data.yaml`` file if one was extracted, and logs the resulting
        directory size at INFO level.

        Raises:
            FileNotFoundError: If the archive does not exist.
            zipfile.BadZipFile: If the archive is not a valid zip file.
        """
        # Coerce so the "/" join below also works for string config values.
        unzip_path = Path(self.config.unzip_dir)
        unzip_path.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

        # Remove the YAML file (data.yaml) after extraction. Only a regular
        # file is removed; unlink() would raise on a directory of that name.
        yaml_file_path = unzip_path / "data.yaml"
        if yaml_file_path.is_file():
            yaml_file_path.unlink()

        size = self.get_size(unzip_path)
        logger.info("Extraction completed in: %s, Size: %s", unzip_path, size)

# Provide the root directory path
root_dir = Path("d:\\Desktop\\NLP\\Lab 1\\NLP-Tokenization-and-Language-Modeling")  # Update with your root directory path

# Use the path to the dataset zip file (joining Paths already yields a Path)
dataset_zip_path = root_dir / "Dataset" / "Angular.zip"

# Define the unzip directory
unzip_directory = root_dir / "Dataset" / "UnzippedAngular"  # Update with your desired unzip directory

# Create a DataIngestionConfig with the updated paths
data_ingestion_config = DataIngestionConfig(
    root_dir=root_dir,
    local_data_file=dataset_zip_path,
    unzip_dir=unzip_directory,
)

# Only the ingestion itself touches the filesystem, so only it is guarded.
try:
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.extract_zip_file()
except Exception as e:
    logger.error(f"An error occurred: {e}")
    # Bare raise re-raises the active exception with its traceback unchanged.
    raise
