In [1]:
import os

In [2]:
%pwd

'c:\\Users\\frup00090410\\Mlops_project\\research'

In [3]:
# Move the working directory one level up from where the notebook was started,
# so later relative paths resolve from the parent folder (presumably the
# project root - TODO confirm).
# NOTE(review): not idempotent - re-running this cell without restarting the
# kernel moves up one more level each time.
os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable bundle of paths and parameters for the data-preprocessing stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Working directory of the preprocessing stage.
    root_dir: Path
    # Directory that is scanned for cleaned ``*.json`` input files.
    cleaned_data_path: Path
    # Directory that receives the split/preprocessed artefacts.
    preprocessed_spilitted_data_path: Path
    # Vocabulary cap handed to the text tokenizer.
    # NOTE(review): annotated as float, but a word count is presumably an int - confirm.
    max_words: float
    # Lookup used to translate raw topic values into topic names.
    topic_names: dict
    # Populated from TEST_SIZE in the params file; presumably the fraction of
    # rows held out for the test split - TODO confirm.
    test_size: float

In [5]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories, write_to_pickle

In [6]:
class ConfigurationManager:
    """Reads the YAML config/params files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the configuration object for the data-preprocessing stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataPreprocessingConfig: paths taken from the ``data_preprocessing``
            section of the config file, parameters taken from the params file.
        """
        stage = self.config.data_preprocessing

        create_directories([stage.root_dir])

        # Collect the keyword arguments first, then build the frozen dataclass.
        settings = {
            "root_dir": Path(stage.root_dir),
            "cleaned_data_path": Path(stage.cleaned_data_path),
            "preprocessed_spilitted_data_path": Path(stage.preprocessed_spilitted_data_path),
            "max_words": self.params.MAX_WORDS,
            "test_size": self.params.TEST_SIZE,
            "topic_names": self.params.TOPIC_NAMES,
        }
        return DataPreprocessingConfig(**settings)

In [7]:
import os
import pandas as pd
import numpy as np
import string
import json
from cnnClassifier import logger
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from collections import Counter
import pickle


In [8]:
class DataPreprocessing:
    """Turns the most recent cleaned JSON file into train/test artefacts.

    The stage locates the newest cleaned file, loads it into a DataFrame,
    splits it, fits a Keras ``Tokenizer`` on the training texts and stores the
    tf-idf matrices, the labels, the fitted tokenizer and class statistics in
    ``config.preprocessed_spilitted_data_path``.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """Keep a reference to the stage configuration.

        Args:
            config: Paths and parameters used by every method of this class.
        """
        self.config = config

    def detect_last_file(self) -> Path:
        """Return the path of the most recently created cleaned JSON file.

        Scans ``config.cleaned_data_path`` for entries ending in ``.json`` and
        picks the one with the greatest ``os.path.getctime`` value (the first
        one listed wins on a tie).

        Returns:
            The absolute path of that file, built with ``os.path.join`` (i.e.
            a string path, usable anywhere a path-like is accepted).

        Raises:
            FileNotFoundError: if the directory contains no ``.json`` file.
        """
        logger.info("Looking for all JSON cleaned files to Select latest one created")
        directory = os.path.join(os.getcwd(), self.config.cleaned_data_path)
        json_files = [f for f in os.listdir(directory) if f.endswith('.json')]

        # Fail with a clear message instead of the opaque TypeError that
        # os.path.join(..., None) would raise further down.
        if not json_files:
            raise FileNotFoundError(f"No cleaned JSON file found in: {directory}")

        latest = max(
            json_files,
            key=lambda name: os.path.getctime(os.path.join(directory, name)),
        )
        logger.info(f"latest cleaned file detected: {latest}")
        return os.path.join(directory, latest)

    def read_data(self, file_path: Path) -> pd.DataFrame:
        """Load a cleaned JSON file into a flat DataFrame.

        The file is parsed with ``json.load`` and flattened with
        ``pandas.json_normalize``; no rows or columns are filtered out.

        Args:
            file_path: Location of the JSON file to read.

        Returns:
            pd.DataFrame: the normalized content of the file.
        """
        logger.info("Started reading cleaned data file")
        # NOTE(review): the file is opened with the platform default encoding;
        # confirm this matches how the upstream stage writes it.
        with open(file_path) as f:
            data = json.load(f)
        df = pd.json_normalize(data)
        logger.info("File read is completed")
        return df

    def vector_classes_info_to_json(self, vector_class, output_file_name: str, output_path: Path):
        """Write per-class counts and percentages of a label vector to JSON.

        Args:
            vector_class: Iterable of class labels (one entry per sample).
            output_file_name: File name without the ``.json`` extension.
            output_path: Existing directory in which the file is created.

        The resulting file holds two mappings, ``counts`` and ``percentages``,
        both keyed by class label.
        """
        class_counts = Counter(vector_class)
        total_samples = sum(class_counts.values())

        class_percentages = {
            cls: count / total_samples * 100 for cls, count in class_counts.items()
        }

        class_info = {
            'counts': class_counts,
            'percentages': class_percentages,
        }

        with open(os.path.join(output_path, f"{output_file_name}.json"), 'w') as f:
            json.dump(class_info, f)

    def preprocess_data(self, df: pd.DataFrame):
        """Split the data, fit the tokenizer and persist every artefact.

        Args:
            df: Frame with a ``text`` column (documents) and a ``Topic`` column
                (raw topic values that are translated through
                ``config.topic_names``). The frame itself is not modified.

        Files written to ``config.preprocessed_spilitted_data_path``:
            * ``metadata_info_train.json`` / ``metadata_info_test.json``
            * ``tokenizer.pickle`` - the tokenizer fitted on the training texts
            * ``X_train_preprocessed`` / ``X_test_preprocessed`` (tf-idf matrices)
            * ``y_train_preprocessed`` / ``y_test_preprocessed``
        """
        out_dir = self.config.preprocessed_spilitted_data_path
        # The stage only guarantees that root_dir exists, so create the output
        # directory here before anything is written into it.
        os.makedirs(out_dir, exist_ok=True)

        # Map into a separate Series: overwriting df['Topic'] would make a
        # second call on the same frame map already-mapped labels to NaN.
        topics = df['Topic'].map(self.config.topic_names)

        # Use the configured hold-out fraction rather than a hard-coded one;
        # the seed stays fixed so the split is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            df.text, topics, test_size=self.config.test_size, random_state=42
        )

        self.vector_classes_info_to_json(y_train, 'metadata_info_train', out_dir)
        self.vector_classes_info_to_json(y_test, 'metadata_info_test', out_dir)

        # Fit on the training texts only, so no test vocabulary leaks in.
        tokenizer = Tokenizer(num_words=self.config.max_words)
        tokenizer.fit_on_texts(X_train)

        # Persist the *fitted instance* (not the Tokenizer class) so the same
        # vocabulary can be reloaded later.
        with open(os.path.join(out_dir, 'tokenizer.pickle'), 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Texts -> integer sequences -> tf-idf matrices.
        X_train_sequences = tokenizer.texts_to_sequences(X_train)
        X_test_sequences = tokenizer.texts_to_sequences(X_test)
        X_train_tfidf = tokenizer.sequences_to_matrix(X_train_sequences, mode="tfidf")
        X_test_tfidf = tokenizer.sequences_to_matrix(X_test_sequences, mode="tfidf")

        write_to_pickle(X_train_tfidf, 'X_train_preprocessed', out_dir)
        write_to_pickle(X_test_tfidf, 'X_test_preprocessed', out_dir)

        write_to_pickle(y_train, 'y_train_preprocessed', out_dir)
        write_to_pickle(y_test, 'y_test_preprocessed', out_dir)
        

        



In [9]:
# Run the preprocessing stage end to end. Each step is bound to its own name
# so intermediate results can be inspected from later cells; any exception
# simply propagates with its original traceback.
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)

latest_cleaned_file = data_preprocessing.detect_last_file()
cleaned_df = data_preprocessing.read_data(latest_cleaned_file)
data_preprocessing.preprocess_data(cleaned_df)

[2023-12-22 12:01:37,026: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-22 12:01:37,030: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-22 12:01:37,033: INFO: common: created directory at: artifacts]
[2023-12-22 12:01:37,037: INFO: common: created directory at: artifacts/data_preprocessing]
