In [1]:
import os

# Move from the notebook's folder up to the project root, but only once.
# An unconditional os.chdir('..') climbs one more level every time this cell
# is re-run, which silently breaks every relative path used afterwards.
# NOTE(review): 'config/config.yaml' is used as the project-root marker because
# that is the relative path the config loader reports in the run log — confirm
# it matches CONFIG_FILE_PATH.
if not os.path.exists(os.path.join('config', 'config.yaml')):
    os.chdir('..')

# Show the resulting working directory as the cell output.
os.getcwd()

'e:\\DataScienceProjects\\car-price-prediction'

In [2]:
# Entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the directory locations used by the data-transformation stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    The ``Path`` annotations are not enforced at runtime; the values are whatever
    the configuration loader supplies (presumably path strings — TODO confirm).
    """
    # Root folder of the data-transformation stage.
    root_dir: Path
    # Folder holding the interim data file that the transformation reads.
    interim_dataset_dir: Path
    # Folder where the train/val/test datasets are written.
    dataset_dir: Path
    # Folder where the fitted label encoders are written.
    label_encoder_dir: Path
    # Folder where the numerical and embedding dimension files are written.
    tensors_dim_dir: Path

In [3]:
# Configuration Manager
from carPricePrediction.constants import *
from carPricePrediction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage configuration entities."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH
    ):
        """Read the config and params YAML files and create the artifacts root folder.

        Args:
            config_filepath: Location of the main configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self)-> DataTransformationConfig:
        """Create the stage's output folders and return its configuration entity.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the loaded configuration.
        """
        stage_cfg = self.config.data_transformation

        # Only output locations are created here; the interim dataset folder is
        # an input of this stage and is deliberately not part of this list.
        output_dirs = [
            stage_cfg.root_dir,
            stage_cfg.dataset_dir,
            stage_cfg.label_encoder_dir,
            stage_cfg.tensors_dim_dir,
        ]
        create_directories(output_dirs)

        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            interim_dataset_dir=stage_cfg.interim_dataset_dir,
            dataset_dir=stage_cfg.dataset_dir,
            label_encoder_dir=stage_cfg.label_encoder_dir,
            tensors_dim_dir=stage_cfg.tensors_dim_dir,
        )

In [4]:
import joblib
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch
from carPricePrediction.logging import logger
from torch.utils.data import TensorDataset, random_split

In [5]:
class DataTransfomration:
    """Turns the interim data file into tensor datasets plus the metadata saved alongside them.

    The (misspelled) class name is kept unchanged because existing callers
    instantiate the class under this name.
    """

    # Column of the interim data that holds the regression target.
    TARGET_FEATURE = 'Price'
    # Share of rows used for training and validation; the remainder is the test set.
    TRAIN_FRACTION = 0.7
    VAL_FRACTION = 0.15
    # Seed used by transform_data so the saved splits are the same on every run.
    DEFAULT_SEED = 42

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (input and output directory locations)."""
        self.config = config

    def transform_data(self, seed=DEFAULT_SEED):
        """Encode, tensorize, split and persist the interim dataset.

        Steps: read the interim file, label-encode every object-typed feature
        column, build an int64 categorical tensor, a float numerical tensor and
        a float target tensor of shape (n, 1), then save the label encoders,
        the tensor dimensions and the train/val/test datasets.

        Args:
            seed: Seed for the random train/val/test split. Pass ``None`` for
                an unseeded (non-reproducible) split.

        Raises:
            FileNotFoundError: If the interim dataset directory contains no file.
            KeyError: If the target column is missing from the data.
        """
        ### Read interim data
        df = pd.read_csv(self._find_interim_file())

        # Get features type. The target is dropped first so that it can never
        # be picked up as a numerical or categorical *feature*.
        target_feature = self.TARGET_FEATURE
        feature_frame = df.drop(columns=[target_feature])
        num_features = feature_frame.select_dtypes(include='number').columns
        cat_features = feature_frame.select_dtypes(include='object').columns

        # Label encoding for categorical features (one fitted encoder per column)
        lbl_encoders = {}
        for feature in cat_features:
            encoder = LabelEncoder()
            df[feature] = encoder.fit_transform(df[feature])
            lbl_encoders[feature] = encoder

        # Convert categorical, numerical and target data into tensors
        cat_tensor = torch.tensor(df[cat_features].to_numpy(), dtype=torch.int64)
        num_tensor = torch.tensor(df[num_features].to_numpy(), dtype=torch.float)
        y_tensor = torch.tensor(df[target_feature].to_numpy(), dtype=torch.float).reshape(-1, 1)

        # Create the embedding size for categorical features:
        # (number of distinct codes, embedding width) per column.
        cat_dims = [len(df[feature].unique()) for feature in cat_features]
        # Rule of thumb for embedding dim (by fastai)
        embedding_dim = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

        ### Save Label Encoders
        encoder_file_path = os.path.join(self.config.label_encoder_dir, 'LE_dict.pkl')
        joblib.dump(lbl_encoders, encoder_file_path)

        ### Save numerical dimensions
        num_dim_file_path = os.path.join(self.config.tensors_dim_dir, 'num_dim.pkl')
        with open(num_dim_file_path, 'wb') as f:
            pickle.dump(num_tensor.shape[1], f)

        ### Save embedding dimensions
        emb_dim_file_path = os.path.join(self.config.tensors_dim_dir, 'embedding_dim.pkl')
        with open(emb_dim_file_path, 'wb') as f:
            pickle.dump(embedding_dim, f)

        ### Save dataset
        dataset = TensorDataset(cat_tensor, num_tensor, y_tensor)
        train_data, val_data, test_data = self.get_random_split(dataset, seed=seed)

        # NOTE(review): each saved split is the object returned by random_split,
        # so whatever loads these files must be able to unpickle that type.
        for split_name, split_data in (
            ("train.pth", train_data),
            ("val.pth", val_data),
            ("test.pth", test_data),
        ):
            torch.save(split_data, os.path.join(self.config.dataset_dir, split_name))

        logger.info("Processed datasets saved successfully.")

    def _find_interim_file(self):
        """Return the path of the interim data file to load.

        Directory listings come back in arbitrary order and may contain
        sub-folders or placeholder files, so entries are restricted to regular
        files, sorted, and ``.csv`` files are preferred when any exist.

        Raises:
            FileNotFoundError: If the directory contains no regular file.
        """
        interim_dir = self.config.interim_dataset_dir
        candidates = sorted(
            name for name in os.listdir(interim_dir)
            if os.path.isfile(os.path.join(interim_dir, name))
        )
        if not candidates:
            raise FileNotFoundError(f"No interim data file found in: {interim_dir}")

        csv_files = [name for name in candidates if name.lower().endswith('.csv')]
        chosen = (csv_files or candidates)[0]
        return os.path.join(interim_dir, chosen)

    def get_random_split(self, dataset, seed=None):
        """Randomly split ``dataset`` into train (70%), validation (15%) and test (rest).

        Args:
            dataset: Any sized dataset accepted by ``random_split``.
            seed: Optional integer seed. When given, the split is drawn from a
                dedicated generator seeded with it, so the same seed always
                yields the same split; when ``None`` the split is unseeded.

        Returns:
            Tuple ``(train_data, val_data, test_data)``.
        """
        total = len(dataset)
        train_size = int(self.TRAIN_FRACTION * total)
        val_size = int(self.VAL_FRACTION * total)
        # The test set absorbs the rounding remainder so the sizes sum to total.
        test_size = total - train_size - val_size
        lengths = [train_size, val_size, test_size]

        if seed is None:
            train_data, val_data, test_data = random_split(dataset, lengths)
        else:
            generator = torch.Generator().manual_seed(seed)
            train_data, val_data, test_data = random_split(dataset, lengths, generator=generator)
        return train_data, val_data, test_data

In [6]:
# Data transformation training pipeline
try:
    # Build the stage configuration (this also creates the output folders),
    # then run the transformation end to end.
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransfomration(config=data_transformation_config)
    data_transformation.transform_data()
except Exception as e:
    # Record the failure with its traceback in the project log, then re-raise
    # the same exception unchanged so the cell still fails visibly.
    logger.exception(e)
    raise

[2024-03-30 19:14:32,534: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2024-03-30 19:14:32,536: INFO: common: yaml file: params.yaml loaded successfully.]
[2024-03-30 19:14:32,537: INFO: common: Created directory at: artifacts]
[2024-03-30 19:14:32,538: INFO: common: Created directory at: artifacts]
[2024-03-30 19:14:32,540: INFO: common: Created directory at: artifacts/data/processed]
[2024-03-30 19:14:32,541: INFO: common: Created directory at: artifacts/encoders]
[2024-03-30 19:14:32,541: INFO: common: Created directory at: artifacts/tensors_dim]
[2024-03-30 19:14:32,637: INFO: 1067956073: Processed datasets saved successfully.]
