In [1]:
import os
# Move the kernel's working directory up to the parent folder.
# NOTE(review): presumably this lands on the project root so that relative
# paths (config/, artifacts/) resolve — confirm against where the notebook lives.
# The move is relative, so re-running this cell without restarting the kernel
# climbs one more level each time.
os.chdir("../")

In [2]:
%pwd

'c:\\Users\\frup00090410\\Mlops_project'

In [15]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataCleaningConfig:
    """Immutable settings consumed by the data-cleaning stage.

    Instances are built by ``ConfigurationManager.get_data_cleaning_config``
    from the ``data_cleaning`` section of the YAML configuration.
    """

    # Working directory of the stage; created when the config is built.
    root_dir: Path
    # Directory that ``DataCleaning.detect_last_file`` scans for ``.json`` files.
    unpreprocessed_data_path: Path
    # Name of the column whose text ``DataCleaning.clean_data`` normalises.
    column_text: str
    # Name of the topic column, as supplied by the configuration.
    column_topic: str
    # Directory into which ``cleaned_data.json`` is written.
    preprocessed_data_path: Path


In [16]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Load the project's YAML files and build typed per-stage configs.

    Constructing an instance parses both YAML files and makes sure the
    top-level artifacts directory named in the config exists.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Parse the config and params files and create the artifacts root.

        Args:
            config_filepath: Location of the main YAML configuration.
            params_filepath: Location of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_cleaning_config(self) -> DataCleaningConfig:
        """Build the settings object for the data-cleaning stage.

        The stage's ``root_dir`` is created as a side effect before the
        config object is assembled.

        Returns:
            A ``DataCleaningConfig`` whose path entries are wrapped in
            ``Path`` and whose column names are passed through unchanged.
        """
        section = self.config.data_cleaning

        create_directories([section.root_dir])

        return DataCleaningConfig(
            root_dir=Path(section.root_dir),
            unpreprocessed_data_path=Path(section.unpreprocessed_data_path),
            column_text=section.column_text,
            column_topic=section.column_topic,
            preprocessed_data_path=Path(section.preprocessed_data_path),
        )

In [18]:
import os
import re
import pandas as pd
import numpy as np
import string
import json
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

In [19]:
class DataCleaning:
    """Locate the newest ingested JSON file, load it, clean it and save it.

    All paths and column names come from the ``DataCleaningConfig`` passed
    to the constructor.
    """

    # Patterns applied, in this order, to every lower-cased text value.
    _BRACKETED_TEXT = re.compile(r'\[.*?\]')
    _PUNCTUATION = re.compile(r'[%s]' % re.escape(string.punctuation))
    _WORDS_WITH_DIGITS = re.compile(r'\w*\d\w*')

    def __init__(self, config: DataCleaningConfig):
        """Store the stage configuration.

        Args:
            config: Paths and column names used by the other methods.
        """
        self.config = config

    def detect_last_file(self) -> Path:
        """Return the most recently created ``.json`` file of the input directory.

        "Most recent" is decided by ``os.path.getctime``; on ties the first
        name returned by ``os.listdir`` wins.

        Returns:
            The file's path, built with ``os.path.join`` from the current
            working directory, the configured directory and the file name
            (i.e. a plain string path).

        Raises:
            FileNotFoundError: If the directory holds no ``.json`` file.
        """
        directory = self.config.unpreprocessed_data_path
        base_dir = os.path.join(os.getcwd(), directory)
        candidates = [
            os.path.join(base_dir, name)
            for name in os.listdir(directory)
            if name.endswith('.json')
        ]
        if not candidates:
            # Previously this fell through to os.path.join(..., None) and
            # surfaced as an opaque TypeError.
            raise FileNotFoundError(f"no .json file found in {directory}")
        return max(candidates, key=os.path.getctime)

    def read_data(self, file_path: Path) -> pd.DataFrame:
        """Load a JSON file and keep only the configured text and topic columns.

        Args:
            file_path: JSON file to load; it is decoded as UTF-8.

        Returns:
            A new DataFrame with exactly two columns, ``config.column_text``
            and ``config.column_topic``, in that order.

        Raises:
            KeyError: If either configured column is absent from the data.
        """
        column_text = self.config.column_text
        column_topic = self.config.column_topic
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # would mis-decode non-ASCII text.
        with open(file_path, encoding="utf-8") as f:
            data = json.load(f)
        # Flatten nested JSON records into columns.
        df = pd.json_normalize(data)
        # Select by the configured names rather than hard-coded ones; copy so
        # later in-place cleaning never touches the normalised frame.
        return df.loc[:, [column_text, column_topic]].copy()

    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Lower-case one text value and strip brackets, punctuation and digit-words."""
        text = text.lower()
        text = cls._BRACKETED_TEXT.sub('', text)
        text = cls._PUNCTUATION.sub('', text)
        return cls._WORDS_WITH_DIGITS.sub('', text)

    def clean_data(self, df: pd.DataFrame):
        """Clean the text column of ``df`` in place and save the result as JSON.

        Steps applied to ``config.column_text``:
            - blank strings are treated as missing and those rows dropped
            - text is lower-cased
            - text in square brackets is removed
            - punctuation is removed
            - words containing digits are removed

        The cleaned frame is written to ``cleaned_data.json`` inside
        ``config.preprocessed_data_path`` (created if missing) using
        ``orient='records'``.

        Args:
            df: Frame to clean. It is modified in place: rows may be dropped
                and the text column is overwritten.

        Returns:
            None.
        """
        column_text = self.config.column_text
        preprocessed_data_path = self.config.preprocessed_data_path

        # Create the directory that is actually written to, not a fixed one.
        os.makedirs(preprocessed_data_path, exist_ok=True)

        # Assign back instead of a chained ``inplace`` call, which does not
        # update ``df`` when pandas Copy-on-Write is active.
        df[column_text] = df[column_text].replace("", np.nan)
        df.dropna(subset=[column_text], inplace=True)
        # One pass per value instead of four separate passes over the column.
        df[column_text] = df[column_text].apply(self._clean_text)

        output_file = os.path.join(preprocessed_data_path, 'cleaned_data.json')
        logger.info(f"downloading cleaned data into file {preprocessed_data_path}")
        json_data = df.to_json(orient='records')
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(json_data)
        logger.info(f"downloaded cleaned data into file {preprocessed_data_path}")
        


In [20]:
# Run the cleaning stage end to end. Any failure propagates unchanged, so no
# handler is needed just to re-raise it.
config = ConfigurationManager()
data_cleaning_config = config.get_data_cleaning_config()
data_cleaning = DataCleaning(config=data_cleaning_config)

# Named intermediate steps so each result can be inspected from the notebook.
latest_file = data_cleaning.detect_last_file()
raw_df = data_cleaning.read_data(latest_file)
data_cleaning.clean_data(raw_df)

[2023-12-21 11:28:16,250: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-21 11:28:16,254: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-21 11:28:16,256: INFO: common: created directory at: artifacts]
[2023-12-21 11:28:16,258: INFO: common: created directory at: artifacts/data_cleaning]
[2023-12-21 11:28:22,722: INFO: 1806934824: downloading cleaned data into file artifacts\data_cleaning]
[2023-12-21 11:28:23,084: INFO: 1806934824: downloaded cleaned data into file artifacts\data_cleaning]
