In [1]:
%pwd

'e:\\FullStack_Data\\MACHINE_LEARNING\\PROJECTS\\MLPROJECT_part_prediction\\research'

In [2]:
import os
# Move from research/ up to the project root (the `src.` imports in later cells
# are presumably resolved relative to the working directory -- confirm).
# The guard keeps this cell idempotent: re-running it, or starting the kernel
# at the project root, no longer climbs one level further each time.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [3]:
%pwd

'e:\\FullStack_Data\\MACHINE_LEARNING\\PROJECTS\\MLPROJECT_part_prediction'

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformConfig:
    """Immutable settings shared by the feature-engineering and split steps."""

    # Directory the stage writes its CSV outputs into (joined with file names
    # such as "FE_data.csv", "train.csv" and "test.csv").
    root_dir: Path
    # Location handed to ``pd.read_csv`` as the stage's input data.
    data_dir: Path


In [None]:
from src.ML_Part_predict.constants import *
from src.ML_Part_predict.utils.common import read_yaml,create_directories
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    The constructor reads the config, params and schema files with ``read_yaml``
    and makes sure the directory named by ``artifacts_root`` exists.
    """

    # NOTE(review): params_file_p defaults to CONFIG_FILE_PATH, so self.params is
    # read from the config file unless a caller overrides it. This looks like a
    # copy-paste slip -- confirm whether the constants module exposes a dedicated
    # params path and switch the default if it does.
    def __init__(self,config_file_p=CONFIG_FILE_PATH,params_file_p=CONFIG_FILE_PATH,schema_file_p=SCHEMA_FILE_PATH):
        self.config=read_yaml(config_file_p)
        self.params=read_yaml(params_file_p)
        self.schema=read_yaml(schema_file_p)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformConfig:
        """Build the ``DataTransformConfig`` from the ``data_transformation`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformConfig: config whose fields are ``pathlib.Path`` objects,
            matching the dataclass's declared field types.
        """
        config=self.config.data_transformation

        create_directories([config.root_dir])

        # The dataclass declares Path fields; coerce the raw YAML values so the
        # declared types actually hold (Path is accepted by os.path.join and
        # pd.read_csv, which is how the fields are consumed).
        data_transform_config=DataTransformConfig(
            root_dir=Path(config.root_dir),
            data_dir=Path(config.data_dir)
        )
        return data_transform_config
        

In [None]:
from src.ML_Part_predict import logger
import pandas as pd
class FeatureEngineering:
    """Clean the raw dataset and write a model-ready ``FE_data.csv``.

    ``feature_E`` reads the CSV at ``config.data_dir``, removes duplicates, fixes
    dtypes, imputes nulls, caps outliers, one-hot encodes the categorical
    columns and saves the result under ``config.root_dir``.
    """

    # Seed used for random-sample imputation so a re-run reproduces the same file.
    RANDOM_STATE=42
    # Numeric columns whose values are capped to the 1.5*IQR fences.
    OUTLIER_COLUMNS=['HoursOfOperation','SettlementAmount','MaintenanceFrequency','PreviousFailures']
    # Columns expanded by pd.get_dummies.
    CATEGORICAL_COLUMNS=['VehicleModel','VehicleYear','PartName','SupplierName','EnvironmentCondition','OperationalIntensity','WarrantyStatus','ClaimYear']

    def __init__(self,config: "DataTransformConfig"):
        logger.info("Feature Engineering Started")
        self.config=config

    @staticmethod
    def Random_Sample_imputation(data,feature,random_state=None):
        """Fill the nulls of one column, in place, with values drawn from that column.

        Args:
            data: DataFrame to modify in place.
            feature: Name of a single column of ``data``.
            random_state: Optional seed forwarded to ``Series.sample``.

        Values are drawn without replacement when enough observed values exist,
        and with replacement when nulls outnumber them. A column with no nulls
        is left untouched; a column with no observed values cannot be imputed
        and is skipped with a warning.
        """
        missing=data[feature].isnull()
        n_missing=int(missing.sum())
        if n_missing==0:
            return

        observed=data[feature].dropna()
        if observed.empty:
            logger.warning(f"column {feature} has no observed values; imputation skipped")
            return

        random_sample=observed.sample(n_missing,replace=n_missing>len(observed),random_state=random_state)
        # Re-label the draws with the row labels of the gaps so .loc aligns them.
        random_sample.index=data.index[missing.to_numpy()]
        data.loc[missing,feature]=random_sample

    @staticmethod
    def handling_outliers(datas,columns):
        """Cap each listed column to ``[q1 - 1.5*IQR, q3 + 1.5*IQR]``.

        Args:
            datas: DataFrame to modify in place.
            columns: Iterable of numeric column names.

        Returns:
            The same DataFrame object, with the listed columns capped.
        """
        for column in columns:
            q1=datas[column].quantile(0.25)
            q3=datas[column].quantile(0.75)
            iqr=q3-q1
            upper_lim=q3+1.5*iqr
            lower_lim=q1-1.5*iqr
            datas[column]=datas[column].apply(lambda x:upper_lim if x>upper_lim else lower_lim if x<lower_lim else x)

        return datas

    def feature_E(self):
        """Run the full feature-engineering pass and save ``FE_data.csv``.

        Raises:
            Exception: any error from reading, transforming or writing the data
            is logged and re-raised unchanged.
        """
        try:
            df=pd.read_csv(self.config.data_dir)
            logger.info(f"the columns are {df.columns} and shape is : {df.shape}")

            # Count before dropping; counting afterwards always reports 0.
            n_duplicates=int(df.duplicated().sum())
            df=df.drop_duplicates()
            logger.info(f"duplicated data number: {n_duplicates}. Duplicates deletion completed")

            # NOTE(review): astype('str') turns a missing year into the text 'nan',
            # which the imputation below no longer sees as null -- confirm that
            # VehicleYear has no missing values in the source data.
            df['VehicleYear']=df['VehicleYear'].astype('str')
            logger.info("vehicleYear dtype changed to object")
            df['ClaimYear']=df['ClaimDate'].str.split('-').str[0]
            logger.info("feature ClaimYear extracted from ClaimDate")
            df['MaintenanceFrequency']=df['MaintenanceFrequency'].str.replace('hours','').astype('Int64')
            logger.info("in feature MaintenanceFrequency hours is removed and dtype changed to Int64")
            df['PreviousFailures']=df['PreviousFailures'].astype('Int64')
            logger.info("in feature PreviousFailures dtype changed to Int64")

            # Impute column by column; the helper works on a single feature.
            for column in df.columns[df.isnull().any()]:
                self.Random_Sample_imputation(data=df,feature=column,random_state=self.RANDOM_STATE)
            logger.info("replacing null values with random imputation completed...")

            self.handling_outliers(datas=df,columns=self.OUTLIER_COLUMNS)
            logger.info("Outlier handling completed...")

            # Assign the result back: replace(..., inplace=True) on a selected
            # column is not guaranteed to update the parent frame.
            df['WarrantyStatus']=df['WarrantyStatus'].replace({'Out of Warranty':'out_of_warranty','In Warranty':'in_warranty'})
            logger.info(f"warranty status values renamed as {df['WarrantyStatus'].unique()}")

            df=pd.get_dummies(df, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
            df['pass_fail']=df['pass_fail'].replace({'pass': 1, 'fail': 0})
            logger.info("implementation of pd.dummies completed... ")

            df=df.drop(['ClaimDate'],axis=1)
            logger.info("features dropped are : ClaimDate ")

            df.to_csv(os.path.join(self.config.root_dir,"FE_data.csv"),index=False)
            logger.info("Feature Engineering Completed and file saved Successfully")

        except Exception:
            logger.exception("Feature Engineering failed")
            raise


In [None]:
import os
from src.ML_Part_predict import logger
from sklearn.model_selection import train_test_split
import pandas as pd

class DataTransformation:
    """Split the feature-engineered dataset into train and test CSV files."""

    def __init__(self,config: "DataTransformConfig"):
        logger.info(f"Data Transformation initiated")
        self.config=config

    def train_test_split(self,test_size=0.25,random_state=42):
        """Read ``FE_data.csv`` from ``config.root_dir`` and write ``train.csv``/``test.csv`` there.

        Args:
            test_size: Share of rows placed in the test set (0.25 is the value
                sklearn uses when none is given, so the default split size is
                unchanged).
            random_state: Seed for the shuffle, so re-runs give the same split.
                Pass ``None`` for a different split on every run.

        Raises:
            Exception: any read, split or write error is logged and re-raised.
        """
        try:
            logger.info("reading the Feature Engineered csv file")
            # Build the path to the file written by the feature-engineering step.
            df=pd.read_csv(os.path.join(self.config.root_dir,"FE_data.csv"))
            logger.info("reading the Feature Engineered csv file completed...")

            # The bare name resolves to sklearn's module-level function, not to
            # this method: method names are not in scope inside a method body.
            train,test=train_test_split(df,test_size=test_size,random_state=random_state)
            logger.info("Splitting of train test done...")

            train.to_csv(os.path.join(self.config.root_dir,"train.csv"),index=False)
            test.to_csv(os.path.join(self.config.root_dir,"test.csv"),index=False)

            logger.info(train.shape)
            logger.info(test.shape)

        except Exception:
            logger.exception("Data Transformation failed")
            raise
        

In [None]:
# Run the stage end to end: load configuration, write FE_data.csv, then split
# it into train.csv and test.csv. Errors propagate with their original
# traceback; the former try/except only re-raised them.
config=ConfigurationManager()
getdatatransform=config.get_data_transformation_config()
featureEng=FeatureEngineering(config=getdatatransform)
featureEng.feature_E()
D_transform=DataTransformation(config=getdatatransform)
D_transform.train_test_split()