In [1]:
import os

In [2]:
os.getcwd()

'c:\\Users\\hasan\\Artificial-Intelligence\\projects\\Resume ATS Score Checker\\notebook'

In [3]:
# Move from the notebook folder up to the project root so that the relative
# paths used by later cells (e.g. the config.yaml path) resolve.
# Guarded on the folder name so re-running this cell does not keep climbing
# one directory higher on every execution.
if os.path.basename(os.getcwd()).lower() == "notebook":
    os.chdir("..")

In [4]:
os.getcwd()

'c:\\Users\\hasan\\Artificial-Intelligence\\projects\\Resume ATS Score Checker'

## config.raw

## constants

##### schema

In [5]:
from pydantic import BaseModel, Field 


class DataIngestionConstants(BaseModel):
    """Directory *names* (not full paths) used by the data-ingestion stage.

    All fields are required strings declared with ``Field(frozen=True)``;
    instances are built by ``__ing__`` from the loaded configuration.
    """
    # name of the top-level output directory (read from CONFIG.ROOT_DIR)
    ROOT_DIR_NAME: str = Field(frozen=True) 
    # name of the data directory (read from CONFIG.DATA.ROOT_DIR)
    DATA_ROOT_DIR_NAME: str = Field(frozen=True) 
    # name of the ingestion directory (read from CONFIG.DATA.INGESTION.ROOT_DIR)
    INGESTION_ROOT_DIR_NAME: str = Field(frozen=True) 
    # name of the directory for raw data (read from CONFIG.DATA.INGESTION.RAW_DATA_DIR)
    RAW_DATA_DIR_NAME: str = Field(frozen=True) 
    # name of the directory for parsed data (read from CONFIG.DATA.INGESTION.PARSED_DATA_DIR)
    PARSED_DATA_DIR_NAME: str = Field(frozen=True) 
    # name of the directory for final data (read from CONFIG.DATA.INGESTION.FINAL_DATA_DIR)
    FINAL_DATA_DIR_NAME: str = Field(frozen=True) 


__all__ = ["DataIngestionConstants", ]

##### values

In [None]:
# from .schema import *
from src.ats.exception import CustomException 
from typing import List, Tuple, Dict
from box import ConfigBox
import sys 


def __ing__(CONFIG:ConfigBox):
    """Build the data-ingestion constants from the loaded configuration.

    Args:
        CONFIG (ConfigBox): configuration exposing ROOT_DIR, DATA.ROOT_DIR and
            the DATA.INGESTION.* directory entries read below.

    Returns:
        DataIngestionConstants: the directory names taken from the configuration.
    """
    data_section = CONFIG.DATA
    ingestion_section = data_section.INGESTION
    directory_names = {
        "ROOT_DIR_NAME": CONFIG.ROOT_DIR,
        "DATA_ROOT_DIR_NAME": data_section.ROOT_DIR,
        "INGESTION_ROOT_DIR_NAME": ingestion_section.ROOT_DIR,
        "RAW_DATA_DIR_NAME": ingestion_section.RAW_DATA_DIR,
        "PARSED_DATA_DIR_NAME": ingestion_section.PARSED_DATA_DIR,
        "FINAL_DATA_DIR_NAME": ingestion_section.FINAL_DATA_DIR,
    }
    return DataIngestionConstants(**directory_names)

# dispatch table: constants name -> builder function that receives the config
process = {"DataIngestion": __ing__}
# names accepted by load(); taken from the dispatch table keys
avl_cons = list(process)

def load(config:ConfigBox, name: str | List[str] | Tuple[str]) -> Dict: 
    """Build the constants object(s) registered under the requested name(s).

    Args:
        config (ConfigBox): configuration handed to each builder function
        name (str | List[str] | Tuple[str]): one name, or a list/tuple of names,
            each of which must appear in ``avl_cons``

        Note: Available name --> DataIngestion, 

    Raises:
        CustomException: wraps any error raised while loading, including the
            ValueError for an unsupported ``name`` type or an unknown name

    Returns:
        Dict: key = requested name, value = object built for that name

              example:
              output = load(config, "DataIngestion")
              output = { "DataIngestion" : DataIngestionConstants } 
              data_ingestion_constants = output["DataIngestion"] 
    """
    try:
        # normalise the request into a list of names
        if isinstance(name, str):
            reqs = [name]
        elif isinstance(name, (list, tuple)):
            reqs = list(name)
        else:
            # the exception must be raised; merely constructing it does nothing
            raise ValueError(f"Unsupported type {{{type(name)}}} for variable {{name}}")

        # reject unknown names before any builder runs
        for req in reqs:
            if req not in avl_cons:
                raise ValueError(f"Unknown value provided in variable \'name\', {req}, name can only have values from {avl_cons}")

        # run the builder registered for each requested name
        return {req: process[req](config) for req in reqs}
    except Exception as e: 
        raise CustomException(e, sys) 


__all__ = ["load"]

##### __init__

In [None]:
# from .values import *
from src.ats.utils import load_yaml
from typing import Dict 



# Forward slashes are accepted as path separators on Windows as well as POSIX,
# whereas the previous backslash-separated literal only resolved on Windows.
# The path is relative to the project root (the current working directory).
CONFIG = load_yaml("src/ats/config/raw/config.yaml")

def load_constants(name: str | list[str] | tuple[str]) -> Dict:
    """Return the constants object(s) for ``name`` using the module-level CONFIG.

    This simply forwards to ``load(CONFIG, name)``.

    Args:
        name (str | list[str] | tuple[str]): one name or several names of the
            constants to build

        Note: Available name --> DataIngestion, 

    Returns:
        Dict: key = requested name, value = object built for that name

              example:
              output = load_constants("DataIngestion")
              output = { "DataIngestion" : DataIngestionConstants } 
              data_ingestion_constants = output["DataIngestion"] 
    """
    constants_by_name = load(CONFIG, name)
    return constants_by_name


__all__ = ["load_constants", "CONFIG"]

In [8]:
constants = load_constants("DataIngestion")["DataIngestion"]
# print every directory-name constant as NAME:value, one per line
for field_name in (
    "ROOT_DIR_NAME",
    "DATA_ROOT_DIR_NAME",
    "INGESTION_ROOT_DIR_NAME",
    "RAW_DATA_DIR_NAME",
    "PARSED_DATA_DIR_NAME",
    "FINAL_DATA_DIR_NAME",
):
    print(f"{field_name}:{getattr(constants, field_name)}")

ROOT_DIR_NAME:artifacts
DATA_ROOT_DIR_NAME:data
INGESTION_ROOT_DIR_NAME:ingestion
RAW_DATA_DIR_NAME:raw
PARSED_DATA_DIR_NAME:parsed
FINAL_DATA_DIR_NAME:final


## entity

In [9]:
from pydantic import BaseModel 
from pathlib import Path 


class DataIngestion(BaseModel):
    """Entity holding the directory paths used by the data-ingestion stage.

    All six fields are required ``Path`` values.
    """
    # top-level output directory
    ROOT_DIR_PATH: Path 
    # data directory
    DATA_ROOT_DIR_PATH: Path 
    # ingestion directory
    INGESTION_ROOT_DIR_PATH: Path 
    # directory the ingestion component saves uploaded files into
    RAW_DATA_DIR_PATH: Path 
    # directory the ingestion component saves parsed text into
    PARSED_DATA_DIR_PATH: Path  
    # directory the ingestion component saves cleaned text into
    FINAL_DATA_DIR_PATH: Path


__all__ = ["DataIngestion", ]

## config

##### builder/__init__

In [None]:
# from src.ats.constants import load_constants 
# update __all__ 

# from src.ats.constants import * 
from dataclasses import dataclass 
from pathlib import Path 
import os 


constants = load_constants(["DataIngestion"])


@dataclass(frozen=True)
class DataIngestionConfig:
    """Directory paths for data ingestion, composed from the loaded constants.

    The attributes below carry no type annotations, so they are plain class
    attributes (not dataclass fields) computed once when the class is defined.
    Each path is the parent path joined with the configured directory name.
    """
    CONFIG = constants["DataIngestion"]
    ROOT_DIR_PATH = Path(CONFIG.ROOT_DIR_NAME)
    DATA_ROOT_DIR_PATH = ROOT_DIR_PATH / CONFIG.DATA_ROOT_DIR_NAME
    INGESTION_ROOT_DIR_PATH = DATA_ROOT_DIR_PATH / CONFIG.INGESTION_ROOT_DIR_NAME
    # the three leaf directories all live directly under the ingestion directory
    RAW_DATA_DIR_PATH = INGESTION_ROOT_DIR_PATH / CONFIG.RAW_DATA_DIR_NAME
    PARSED_DATA_DIR_PATH = INGESTION_ROOT_DIR_PATH / CONFIG.PARSED_DATA_DIR_NAME
    FINAL_DATA_DIR_PATH = INGESTION_ROOT_DIR_PATH / CONFIG.FINAL_DATA_DIR_NAME


__all__ = ["DataIngestionConfig", ]

In [11]:
# print every configured directory path as "NAME: value", one per line
for attribute_name in (
    "ROOT_DIR_PATH",
    "DATA_ROOT_DIR_PATH",
    "INGESTION_ROOT_DIR_PATH",
    "RAW_DATA_DIR_PATH",
    "PARSED_DATA_DIR_PATH",
    "FINAL_DATA_DIR_PATH",
):
    print(f"{attribute_name}: {getattr(DataIngestionConfig, attribute_name)}")

ROOT_DIR_PATH: artifacts
DATA_ROOT_DIR_PATH: artifacts\data
INGESTION_ROOT_DIR_PATH: artifacts\data\ingestion
RAW_DATA_DIR_PATH: artifacts\data\ingestion\raw
PARSED_DATA_DIR_PATH: artifacts\data\ingestion\parsed
FINAL_DATA_DIR_PATH: artifacts\data\ingestion\final


##### __init__

In [12]:
# from .builder import * 

## components

##### data_ingestion

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx 
from unstructured.partition.html import partition_html 
from src.ats.utils import save_file, create_dirs
from src.ats.exception import CustomException 
from src.ats import logging 
from pydantic import BaseModel 
from fastapi import UploadFile 
from string import punctuation 
# from src.ats.entity import * 
from pathlib import Path 
from typing import Dict 
import sys 


class DataIngestionComponents(BaseModel): 
    """Data-ingestion steps: store uploaded files, parse them to text, clean the text.

    Attributes:
        data_ingestion_config (DataIngestion): directories each step writes to
            (RAW_DATA_DIR_PATH, PARSED_DATA_DIR_PATH, FINAL_DATA_DIR_PATH).
    """
    data_ingestion_config: DataIngestion 

    def __load(self, files:List[UploadFile]) -> Dict[str, str]: 
        """Save every uploaded file into the raw-data directory.

        Args:
            files (List[UploadFile]): files that have been uploaded

        Raises:
            CustomException: wraps any error, including the ValueError raised
                when no files are given or a file has no usable name

        Returns:
            Dict[str, str]:
            key = normalised file name (base name, stripped, lower-cased)

            value = path the file was saved to

            example:
            output = {
                "xyz.pdf" : "path/to/the/file", 
                "abc.docx": "path/to/the/file", 
                ...
            } 
        """
        try:
            logging.info("In __load")
            files_len = len(files)
            # guard first: an empty upload is an error, not an empty result
            if files_len == 0:
                raise ValueError(f"{files_len} files recieved.")
            output = {}
            for file in files:
                if not file.filename:
                    raise ValueError("Uploaded file has no file name.")
                # The name comes from the client, so keep only its last path
                # component; otherwise "../x" could escape the raw-data directory.
                file_name = os.path.basename(file.filename.replace("\\", "/"))
                file_name = file_name.strip().lower()
                if file_name in ("", ".", ".."):
                    raise ValueError(f"Invalid file name: '{file.filename}'")
                logging.info(f"working with '{file_name}'")
                path = os.path.join(self.data_ingestion_config.RAW_DATA_DIR_PATH, file_name)
                # save file to local  
                save_file(file.file.read(), path)
                output[file_name] = path
                logging.info(f"'{file_name}' saved at '{path}'")
            logging.info("Out __load")
            return output
        except Exception as e: 
            logging.error(e) 
            raise CustomException(e, sys) 

    def __parse(self, info:Dict[str, str]) -> Dict[str, str]: 
        """Parse each stored file with unstructured; supported extensions: .pdf, .docx, .html

        Args:
            info (Dict[str, str]): 

                key = name of file 

                value = path of file 

        Raises:
            CustomException: wraps any error, including the ValueError raised
                for a file whose extension is not supported

        Returns:
            Dict[str, str]: 
            key = name of file 

            value = parsed text (elements joined by a blank line)

            example:
            output = {
                "xyz.pdf" : "string_parsed_data_of_xyz.pdf", 
                "abc.docx": "string_parsed_data_of_abc.docx", 
                ...
            } 
        """
        try:
            logging.info("In __parse")
            # extension -> (partition function, name used in the log message)
            partitioners = {
                ".pdf": (partition_pdf, "partition_pdf"),
                ".docx": (partition_docx, "partition_docx"),
                ".html": (partition_html, "partition_html"),
            }
            output = {}
            for file_name, path in info.items():
                logging.info(f"parsing '{file_name}'")
                ext = os.path.splitext(file_name)[1].lower()
                if ext not in partitioners:
                    raise ValueError(f"Unsupported file type: {file_name}")
                partition, partitioner_type = partitioners[ext]
                logging.info(f"using '{partitioner_type}'")
                logging.info(f"path of file for partitioning '{path}'")
                elements = partition(path)
                elements_string = "\n\n".join(str(el) for el in elements)
                logging.info("partitioning complete.")
                # save file to local 
                parsed_path = os.path.join(self.data_ingestion_config.PARSED_DATA_DIR_PATH, file_name)
                save_file(elements_string, parsed_path)
                output[file_name] = elements_string
            logging.info("Out __parse")
            return output
        except Exception as e: 
            logging.error(e) 
            raise CustomException(e, sys) 

    def __clean(self, info:Dict[str, str]) -> Dict[str, str]: 
        """Remove punctuation from the parsed text while keeping line breaks.

        Every character of ``string.punctuation`` and the en dash is replaced
        by a space, runs of whitespace are collapsed to one space, and each
        original line break is kept as a ``"\\n"`` token separated from its
        neighbours by single spaces (e.g. ``"a, b\\nc"`` -> ``"a b \\n c"``).

        Args:
            info (Dict[str, str]): 

                key = name of file 

                value = parsed text 

        Raises:
            CustomException: wraps any error raised while cleaning or saving

        Returns:
            Dict[str, str]: 
            key = name of file 

            value = cleaned text 

            example:
            output = {
                "xyz.pdf" : "cleaned_string_parsed_data_of_xyz.pdf", 
                "abc.docx": "cleaned_string_parsed_data_of_abc.docx", 
                ...
            } 
        """
        try:
            logging.info("In __clean") 
            # translation table: punctuation and the en dash all become a space
            to_space = str.maketrans({char: " " for char in punctuation + "–"})
            output = {}
            for file_name, elements_string in info.items():
                logging.info(f"cleaning '{file_name}'")
                # Handle line breaks by splitting on them instead of swapping in
                # a sentinel word, which would corrupt text that already
                # contains that word.
                tokens = []
                for index, line in enumerate(elements_string.split("\n")):
                    if index:
                        tokens.append("\n")
                    tokens.extend(line.translate(to_space).split())
                cleaned_string = " ".join(tokens)
                # save file to local 
                path = os.path.join(self.data_ingestion_config.FINAL_DATA_DIR_PATH, file_name)
                save_file(cleaned_string, path)
                output[file_name] = cleaned_string
            logging.info("Out __clean") 
            return output
        except Exception as e: 
            logging.error(e) 
            raise CustomException(e, sys) 

    def _main(self, files: List[UploadFile]) -> Dict[str, str]: 
        """Run the ingestion steps in order: load -> parse -> clean.

        Args:
            files (List[UploadFile]): files that have been uploaded

        Raises:
            CustomException: raised by any of the three steps on failure

        Returns:
            Dict[str, str]: 
            key = name of file 

            value = cleaned text 

            example:
            output = _main(files)
            output = {
                "xyz.pdf" : "cleaned_string_parsed_data_of_xyz.pdf", 
                "abc.docx": "cleaned_string_parsed_data_of_abc.docx", 
                ...
            } 
        """
        # create required dir's, parents before children
        config = self.data_ingestion_config
        for dir_path in (
            config.ROOT_DIR_PATH,
            config.DATA_ROOT_DIR_PATH,
            config.INGESTION_ROOT_DIR_PATH,
            config.RAW_DATA_DIR_PATH,
            config.PARSED_DATA_DIR_PATH,
            config.FINAL_DATA_DIR_PATH,
        ):
            create_dirs(dir_path)
        # return output 
        return self.__clean(self.__parse(self.__load(files))) 
    
__all__ = ["DataIngestionComponents"]

##### __init__

In [None]:
# from src.ats.components.data_ingestion import *

## pipeline

##### __init__

In [None]:
# from src.ats.components import * 
# from src.ats.config.builder import * 
from dataclasses import dataclass 


@dataclass 
class DataIngestionPipeline: 
    """Wires the ingestion configuration into the ingestion components and runs them."""

    def _run(self, files: List[UploadFile]) -> Dict[str, str]: 
        """runs data ingestion pipeline and returns the output

        Args:
            files (List[UploadFile]): files that have been uploaded

        Returns:
            Dict[str, str]: whatever ``DataIngestionComponents._main`` returns
            for the given files
        """
        self.config = DataIngestionConfig()
        # DataIngestionComponents is a pydantic model: it must be created with
        # keyword arguments, and its field expects a DataIngestion entity, so
        # the configured paths are copied into one here.
        ingestion_paths = DataIngestion(
            ROOT_DIR_PATH=self.config.ROOT_DIR_PATH,
            DATA_ROOT_DIR_PATH=self.config.DATA_ROOT_DIR_PATH,
            INGESTION_ROOT_DIR_PATH=self.config.INGESTION_ROOT_DIR_PATH,
            RAW_DATA_DIR_PATH=self.config.RAW_DATA_DIR_PATH,
            PARSED_DATA_DIR_PATH=self.config.PARSED_DATA_DIR_PATH,
            FINAL_DATA_DIR_PATH=self.config.FINAL_DATA_DIR_PATH,
        )
        self.components = DataIngestionComponents(data_ingestion_config=ingestion_paths)
        return self.components._main(files) 
    
__all__ = ["DataIngestionPipeline", ]