In [6]:
import os

In [7]:
%pwd

'e:\\FullStack_Data\\MACHINE_LEARNING\\PROJECTS\\Detailed_Project\\Detailed_MLFLOW\\research'

In [8]:
os.chdir('../')

In [9]:
%pwd

'e:\\FullStack_Data\\MACHINE_LEARNING\\PROJECTS\\Detailed_Project\\Detailed_MLFLOW'

In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Data_Transformation_Config:
    root_dir: Path
    data_transformation_input: Path

In [11]:
from src.Detailed_MLFLow_project.constants import *
from src.Detailed_MLFLow_project.utils.common import create_directories,read_yaml
class ConfigurationManager:
    def __init__(self,config_read=CONFIG_FILE_PATH,schema_read=SCHEMA_FILE_PATH,params_read=PARAMS_FILE_PATH ):
        self.config=read_yaml(config_read),
        self.schema=read_yaml(schema_read),
        self.params=read_yaml(params_read)

        create_directories([self.config.artifacts_root])

    def data_transformed_configuration(self)->Data_Transformation_Config:
        config=self.config.data_transformation

        create_directories([config.root_dir])
    
        get_data_transformed_config=Data_Transformation_Config(
            root_dir=config.root_dir,
            data_transformation_input=config.unzip_data

        )

        return get_data_transformed_config



In [12]:
import pandas as pd
from src import logger
from sklearn.model_selection import train_test_split

class Feature_Engineering:
    def __init__(self,config:Data_Transformation_Config):
        self.config=config

    def Random_Sample_imputation(self,data,feature):
        random_sample=data[feature].dropna().sample(data[feature].isnull().sum())               
        random_sample.index=data[data[feature].isnull()].index
        data.loc[data[feature].isnull(),feature]=random_sample

    def handling_outliers(self,datas,columns):
        for i in columns:
            q1=datas[i].quantile(0.25)
            q3=datas[i].quantile(0.75)
            IQR=q3-q1
            upper_lim=q3+1.5*IQR
            lower_lim=q1-1.5*IQR
            datas[i]=datas[i].apply(lambda x:upper_lim if x>upper_lim else lower_lim if x<lower_lim else x)
        
        return datas

    def feature_E(self):
        try:
            df=pd.read_csv(self.config.data_dir)
            logger.info(f"the columns are {df.columns} and shape is : {df.shape}")

            df.drop_duplicates(inplace=True)
            logger.info(f"duplicated data number: {df.duplicated().sum()}. Duplicates deletion completed")

            df['VehicleYear']=df['VehicleYear'].astype('str')
            logger.info(f"vehicleYear dtype changed to object")
            df['ClaimYear']=df['ClaimDate'].str.split('-').str[0]
            logger.info(f"feature ClaimYear extracted from ClaimDate")
            df['MaintenanceFrequency']=df['MaintenanceFrequency'].str.replace('hours','').astype('Int64')
            logger.info(f"in feature MaintenanceFrequency hours is removed and dtype changed to Int64")
            df['PreviousFailures']=df['PreviousFailures'].astype('Int64')
            logger.info(f"in feature PreviousFailures dtype changed to Int64")

            for col in df.columns:
                self.Random_Sample_imputation(data=df, feature=col)
            logger.info(f"replacing null values with random imputation completed...")

            self.handling_outliers(datas=df,columns=['HoursOfOperation','SettlementAmount','MaintenanceFrequency','PreviousFailures'])
            logger.info(f"Outlier handling completed...")

            df['WarrantyStatus'].replace({'Out of Warranty':'out_of_warranty','In Warranty':'in_warranty'},inplace=True)
            logger.info(f"warranty status values renamed as {df['WarrantyStatus'].unique()}")
            
            df=pd.get_dummies(df, columns=['VehicleModel','VehicleYear','PartName','SupplierName','EnvironmentCondition','OperationalIntensity','WarrantyStatus','ClaimYear'], drop_first=True)
            df['pass_fail'].replace({'pass': 1, 'fail': 0}, inplace=True)    
            logger.info(f"implementation of pd.dummies completed... ")

            df.drop(['ClaimDate'],axis=1,inplace=True)
            logger.info(f"features dropped are : ClaimDate ")

            save_path = Path("artifacts/data_transformation/FE_data.csv")
            logger.info(f"Saving file to {save_path}")
            df.to_csv(save_path, index=False)

            logger.info(f"Feature Engineering Completed and file saved Successfully")
    
        except Exception as e:
            raise e
        

class DataTransformation:
    def __init__(self,config: Data_Transformation_Config):
        self.config=config

    def train_test_split(self):
        try:
            
            df=pd.read_csv("artifacts/data_transformation/FE_data.csv")
            logger.info("data imported succcessfully")

            train,test=train_test_split(df)
            logger.info("train test split completed")

            train.to_csv(os.path.join(self.config.root_dir,"train.csv"),index=False)
            test.to_csv(os.path.join(self.config.root_dir,"test.csv"),index=False)
            logger.info("train and test file saved completed")

            logger.info(f"train shape: {train.shape} and test shape : {test.shape} ")

        except Exception as e:
            raise e
        

In [13]:
try:
    conf=ConfigurationManager()
    con=conf.data_transformed_configuration()  
    fe=Feature_Engineering(con)
    fe1=fe.feature_E()
    dt=DataTransformation(con)
    dt.train_test_split()
except Exception as e:
    raise e

[2024-03-29 16:50:12,345: INFO: common: yaml file {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/hrishikesh147/MLProject/raw/main/part_namesz.zip', 'local_data_file': 'artifacts/data_ingestion/zipped_data.zip', 'unzip_data': 'artifacts/data_ingestion/data'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'status_file': 'artifacts/data_validation/status.txt', 'unzip_data_dir': 'artifacts/data_ingestion/data/part_names.csv'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_transformation_input': 'artifacts/data_ingestion/data'}} loaded successfully]
[2024-03-29 16:50:12,380: INFO: common: yaml file {'COLUMNS': {'VehicleModel': 'object', 'VehicleYear': 'int64', 'PartName': 'object', 'HoursOfOperation': 'float64', 'SupplierName': 'object', 'ClaimDate': 'object', 'SettlementAmount': 'float64', 'MaintenanceFrequency': 'object', 'EnvironmentCondition': 'object', 'OperationalI

AttributeError: 'tuple' object has no attribute 'artifacts_root'