In [1]:
import os
# Move the working directory one level up so that the relative paths used in
# later cells resolve from there (presumably the project root — confirm with %pwd).
# Note: this is relative to the current directory, so running the cell twice
# moves up two levels.
os.chdir("../")

In [22]:
%pwd

'c:\\Users\\frup00090410\\Mlops_project'

In [23]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable bundle of settings for the data-preprocessing stage.

    Built by ``ConfigurationManager.get_data_preprocessing_config`` from the
    ``data_preprocessing`` section of the loaded configuration.
    """

    # Directory passed to ``create_directories`` when this config is built.
    root_dir: Path
    # Location that ``DataPreprocessing.detect_last_file`` scans for ``.json`` files.
    unpreprocessed_data_path: Path
    # Name of the text column that ``DataPreprocessing.clean_data`` cleans.
    column_text: str
    # Name of the topic column (read by ``DataPreprocessing.read_data``).
    column_topic: str
    # Path associated with the cleaned output; referenced by ``clean_data``.
    preprocessed_data_path: Path


In [24]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [33]:
class ConfigurationManager:
    """Load the YAML config/params files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files, then call ``create_directories`` on the artifacts root.

        :param config_filepath: path of the main configuration YAML
        :param params_filepath: path of the parameters YAML
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the settings object for the data-preprocessing stage.

        Calls ``create_directories`` on the stage's ``root_dir`` and wraps the
        values of the ``data_preprocessing`` config section in a
        ``DataPreprocessingConfig`` (path-like entries are converted to ``Path``).

        :return: the populated ``DataPreprocessingConfig``
        """
        section = self.config.data_preprocessing
        create_directories([section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(section.root_dir),
            unpreprocessed_data_path=Path(section.unpreprocessed_data_path),
            column_text=section.column_text,
            column_topic=section.column_topic,
            preprocessed_data_path=Path(section.preprocessed_data_path),
        )

In [48]:
import json
import os
import re
import string

import numpy as np
import pandas as pd

from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

In [76]:
class DataPreprocessing:
    """Locate, load and clean the raw JSON data left by the ingestion stage.

    Intended call order: ``detect_last_file`` -> ``read_data`` -> ``clean_data``.
    """

    # File name used when ``preprocessed_data_path`` designates a directory.
    _DEFAULT_OUTPUT_NAME = "cleaned_data.json"

    # Patterns applied, in this order, by ``_clean_text``.
    _BRACKETED = re.compile(r'\[.*?\]')
    _PUNCTUATION = re.compile(r'[%s]' % re.escape(string.punctuation))
    _WORD_WITH_DIGIT = re.compile(r'\w*\d\w*')

    def __init__(self, config: "DataPreprocessingConfig"):
        """
        :param config: settings for this stage; the methods read
            ``unpreprocessed_data_path``, ``column_text``, ``column_topic`` and
            ``preprocessed_data_path`` from it
        """
        self.config = config

    def detect_last_file(self) -> Path:
        """
        Return the most recent ``.json`` file left by the ingestion pipeline.

        "Most recent" means the largest ``os.path.getctime`` value (creation
        time on Windows, last metadata change on Unix). On a tie the file
        listed first by ``os.listdir`` wins.

        :return: path of the newest JSON file, anchored at the current working
            directory when the configured directory is relative
        :raises FileNotFoundError: if the directory does not exist or contains
            no ``.json`` file
        """
        directory = os.path.join(os.getcwd(), self.config.unpreprocessed_data_path)
        json_files = [name for name in os.listdir(directory) if name.endswith('.json')]
        if not json_files:
            # Previously this fell through to os.path.join(..., None) -> TypeError.
            raise FileNotFoundError(f"no .json file found in {directory}")

        newest = max(
            json_files,
            key=lambda name: os.path.getctime(os.path.join(directory, name)),
        )
        return Path(directory, newest)

    def read_data(self, file_path: Path) -> pd.DataFrame:
        """
        Load a JSON file and keep only the configured text and topic columns.

        :param file_path: JSON file to load
        :return: DataFrame holding the ``column_text`` and ``column_topic``
            columns, in that order
        :raises KeyError: if one of the configured columns is absent
        """
        wanted = [self.config.column_text, self.config.column_topic]

        # Binary mode lets ``json`` detect UTF-8/16/32 itself instead of
        # decoding with the platform's default encoding.
        with open(file_path, "rb") as f:
            data = json.load(f)

        # Flatten nested records into columns.
        df = pd.json_normalize(data)

        missing = [name for name in wanted if name not in df.columns]
        if missing:
            raise KeyError(f"columns {missing} not found in {file_path}")

        # Subset on the configured names rather than hardcoded ones.
        return df.loc[:, wanted]

    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Lowercase ``text``, then strip bracketed spans, punctuation and words containing digits."""
        text = text.lower()
        text = cls._BRACKETED.sub('', text)
        text = cls._PUNCTUATION.sub('', text)
        return cls._WORD_WITH_DIGIT.sub('', text)

    def _resolve_output_path(self) -> Path:
        """Return the file the cleaned data is written to.

        ``preprocessed_data_path`` is used as is when it looks like a file;
        when it is an existing directory or has no suffix it is treated as a
        directory and ``cleaned_data.json`` is placed inside it.
        """
        target = Path(self.config.preprocessed_data_path)
        if target.is_dir() or not target.suffix:
            return target / self._DEFAULT_OUTPUT_NAME
        return target

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        '''Clean the text column and save the result as JSON records.

        Steps applied to ``column_text``:
            - rows whose text is blank or missing are dropped
            - text is lowercased
            - text in square brackets is removed
            - punctuation is removed
            - words containing numbers are removed

        The frame is modified in place and also returned. The cleaned records
        are written to the file derived from ``preprocessed_data_path``; its
        parent directory is created if needed.

        :param df: frame containing ``column_text``
        :return: the cleaned frame (same object as ``df``)
        '''
        column_text = self.config.column_text
        output_path = self._resolve_output_path()
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Assign the result back: an in-place replace on ``df[column_text]``
        # operates on a temporary under pandas copy-on-write and is lost.
        df[column_text] = df[column_text].replace("", np.nan)
        # Remove all rows where the text column is NaN
        df.dropna(subset=[column_text], inplace=True)
        # Single pass over the column instead of four separate apply calls
        df[column_text] = df[column_text].apply(self._clean_text)

        logger.info(f"downloading cleaned data into file {output_path}")
        json_data = df.to_json(orient='records')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json_data)
        logger.info(f"downloaded cleaned data into file {output_path}")

        return df
        


In [34]:
config = ConfigurationManager()

[2023-12-20 16:32:03,890: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-20 16:32:03,893: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-20 16:32:03,895: INFO: common: created directory at: artifacts]


In [35]:
# NOTE(review): the variable is named after ingestion, but it holds the
# DataPreprocessingConfig returned by get_data_preprocessing_config().
data_ingestion_config = config.get_data_preprocessing_config()

[2023-12-20 16:32:06,775: INFO: common: created directory at: artifacts/data_preprocessing]


In [36]:
# NOTE(review): named after ingestion, but this is a DataPreprocessing instance.
data_ingestion = DataPreprocessing(config=data_ingestion_config)

In [77]:
data_ingestion.detect_last_file()

FileNotFoundError: [WinError 2] Le fichier spécifié est introuvable: 'complaints-2021-05-14_08_16_.json'

In [None]:
# Run the preprocessing stage end to end: locate the newest ingested JSON file,
# load it, then clean and save the text column.
try:
    config = ConfigurationManager()
    data_preprocessing_config = config.get_data_preprocessing_config()
    data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
    latest_file = data_preprocessing.detect_last_file()
    raw_df = data_preprocessing.read_data(latest_file)
    data_preprocessing.clean_data(raw_df)
except Exception as e:
    # Record the failure with its traceback, then let it propagate unchanged.
    logger.exception(e)
    raise

In [54]:
# NOTE(review): hardcoded, user-specific absolute path — this cell fails on any
# machine where this directory does not exist.
os.chdir('c:\\Users\\frup00090410\\Mlops_project')

In [70]:
# Scratch version of DataPreprocessing.detect_last_file: find the .json file in
# the ingestion directory with the largest os.path.getctime value.
directory = 'artifacts/data_ingestion'
json_files = [f for f in os.listdir(directory) if f.endswith('.json')]
max_element = None  # stays None when the directory has no .json file
max_output = float('-inf')  # lower than any timestamp, so the first file is always taken

for element in json_files:
    output = os.path.getctime(os.path.join(os.getcwd(), directory,element))
    if output > max_output:
        max_output = output
        max_element = element




In [71]:
json_files

['complaints-2021-05-14_08_16_.json', 'data.json']

In [73]:
# Repeat of the scan above on the existing `json_files` / `directory`
# variables: keeps the file name with the largest os.path.getctime value.
# NOTE(review): depends on state left by earlier cells; fails on a fresh kernel
# unless the cell defining `json_files` and `directory` has been run.
max_element = None
max_output = float('-inf')

for element in json_files:
    output = os.path.getctime(os.path.join(os.getcwd(), directory,element))
    if output > max_output:
        max_output = output
        max_element = element

In [75]:
max_element

'data.json'

In [72]:
# NOTE(review): unfinished scratch loop — `time_stamp` is only referenced here
# and never assigned in the visible cells, and the recorded TypeError does not
# match this body, so the cell was presumably edited after it last ran.
for file in json_files:
    time_stamp

TypeError: 'float' object is not callable

In [None]:
import os

# Report whether the placeholder path exists on disk.
directory = '/path/to/directory'

print('Directory exists' if os.path.exists(directory) else 'Directory does not exist')

In [59]:
import os

In [68]:
os.path.join(os.getcwd(), directory, 'complaints-2021-05-14_08_16_.json')

'c:\\Users\\frup00090410\\Mlops_project\\artifacts/data_ingestion\\complaints-2021-05-14_08_16_.json'

In [69]:
os.path.getctime(os.path.join(os.getcwd(), directory))

1702914938.9359896