In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Admin\\Desktop\\Rohit\\MachineLearning\\ml-mlops-workflow\\research\\Modeling'

In [3]:
# Move the working directory one level up from the notebook's folder so that
# relative paths used by later cells resolve from there.
# NOTE(review): this cell is not idempotent - every re-run climbs one more level.
# The recorded %pwd output ends in research\Modeling, so a single step lands in
# research\ - confirm that is where the relative config\config.yaml is expected.
os.chdir(os.pardir)

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes its outputs into
            (``train.csv`` / ``test.csv`` are joined onto it).
        data_path: CSV file the stage reads as its input.
    """

    # Output directory of the stage.
    root_dir: Path
    # Input CSV consumed by the stage.
    data_path: Path

In [5]:
from carbonfootprint.constants import *
from carbonfootprint.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the main config.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [7]:
import os
from carbonfootprint import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [8]:
# Load the ingested dataset into `df` for the exploratory cells below.
# NOTE(review): this is an absolute path on one specific machine, so the cell
# fails anywhere else. It presumably names the same file as
# config.data_transformation.data_path - confirm and read it from the config.
df = pd.read_csv("c:\\Users\\Admin\\Desktop\\Rohit\\MachineLearning\\ml-mlops-workflow\\artifacts\\data_ingestion\\data.csv")

In [9]:
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [10]:
# Text-valued columns whose distinct values are inspected below.
categorical_features = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']

# For each column: print its name, its distinct values, and how many there are.
for column in categorical_features:
    distinct_values = df[column].unique()
    print(f'{column}:-')
    print(distinct_values)
    print(len(distinct_values))
    print('-------------------------------------------------------------------------')

Make:-
['ACURA' 'ALFA ROMEO' 'ASTON MARTIN' 'AUDI' 'BENTLEY' 'BMW' 'BUICK'
 'CADILLAC' 'CHEVROLET' 'CHRYSLER' 'DODGE' 'FIAT' 'FORD' 'GMC' 'HONDA'
 'HYUNDAI' 'INFINITI' 'JAGUAR' 'JEEP' 'KIA' 'LAMBORGHINI' 'LAND ROVER'
 'LEXUS' 'LINCOLN' 'MASERATI' 'MAZDA' 'MERCEDES-BENZ' 'MINI' 'MITSUBISHI'
 'NISSAN' 'PORSCHE' 'RAM' 'ROLLS-ROYCE' 'SCION' 'SMART' 'SRT' 'SUBARU'
 'TOYOTA' 'VOLKSWAGEN' 'VOLVO' 'GENESIS' 'BUGATTI']
42
-------------------------------------------------------------------------
Model:-
['ILX' 'ILX HYBRID' 'MDX 4WD' ... 'JETTA TDI CLEAN DIESEL'
 'PASSAT TDI CLEAN DIESEL' 'TOUAREG TDI CLEAN DIESEL']
2075
-------------------------------------------------------------------------
Vehicle Class:-
['COMPACT' 'SUV - SMALL' 'MID-SIZE' 'TWO-SEATER' 'MINICOMPACT'
 'SUBCOMPACT' 'FULL-SIZE' 'STATION WAGON - SMALL' 'SUV - STANDARD'
 'VAN - CARGO' 'VAN - PASSENGER' 'PICKUP TRUCK - STANDARD' 'MINIVAN'
 'SPECIAL PURPOSE VEHICLE' 'STATION WAGON - MID-SIZE'
 'PICKUP TRUCK - SMALL']
16
-----------

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Columns that are expanded into 0/1 indicator columns.
cat_features = ['Transmission', 'Fuel Type']

# One indicator frame per categorical column, with columns named "<feature>_<value>".
encoded_parts = [
    df[feature].str.get_dummies().add_prefix(feature + '_')
    for feature in cat_features
]

# Remaining original columns first, then the indicator columns in cat_features order.
df_new = pd.concat([df.drop(columns=cat_features), *encoded_parts], axis=1)

df_new.head()

# for feature in categorical_features:
#     df_new['feature'] = df[feature].str.get_dummies()
# df_new.head()

# encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap
# X_train_encoded = encoder.fit_transform(df)
# X_test_encoded = encoder.transform(X_test)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km),...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,ACURA,ILX,COMPACT,2.0,4,9.9,6.7,8.5,33,196,...,0,0,0,0,0,0,0,0,0,1
1,ACURA,ILX,COMPACT,2.4,4,11.2,7.7,9.6,29,221,...,0,0,0,1,0,0,0,0,0,1
2,ACURA,ILX HYBRID,COMPACT,1.5,4,6.0,5.8,5.9,48,136,...,1,0,0,0,0,0,0,0,0,1
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,12.7,9.1,11.1,25,255,...,0,0,0,0,0,0,0,0,0,1
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,12.1,8.7,10.6,27,244,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Remaining text columns are removed outright instead of being encoded.
columns_to_drop = ['Make', 'Model', 'Vehicle Class']

# NOTE(review): df_new is rebound to its own result, so re-running this cell
# raises KeyError because the columns are already gone.
df_new = df_new.drop(columns_to_drop, axis='columns')

df_new.head()

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km),Transmission_A10,Transmission_A4,Transmission_A5,...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,2.0,4,9.9,6.7,8.5,33,196,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2.4,4,11.2,7.7,9.6,29,221,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1.5,4,6.0,5.8,5.9,48,136,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,3.5,6,12.7,9.1,11.1,25,255,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3.5,6,12.1,8.7,10.6,27,244,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [27]:
import numpy as np

class DataTransformation:
    """Encodes the ingested dataset and writes train/test splits to disk.

    The class works only from ``config`` (and data passed in explicitly); it
    does not depend on any notebook-level variables, so it behaves the same on
    a fresh kernel.

    Typical use::

        transformer = DataTransformation(config=data_transformation_config)
        transformer.catrgorical_encoding()
        transformer.train_test_spliting()

    Note: further transformation steps (scaling, PCA, ...) can be added here
    before the data is handed to the model.
    """

    # Columns expanded into 0/1 indicator columns named "<feature>_<value>".
    CATEGORICAL_FEATURES = ('Transmission', 'Fuel Type')
    # Text columns removed outright rather than encoded.
    DROPPED_COLUMNS = ('Make', 'Model', 'Vehicle Class')

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (output directory).
        """
        self.config = config
        # Result of the latest catrgorical_encoding() call; None until it runs.
        self._encoded_data = None

    def catrgorical_encoding(self, data=None):
        """One-hot encode the categorical columns and drop the text columns.

        The (misspelled) method name is kept so existing callers keep working.

        Args:
            data: Optional DataFrame to encode. When omitted, the CSV at
                ``self.config.data_path`` is read.

        Returns:
            A new DataFrame holding the non-categorical columns followed by
            the indicator columns. The input frame is not modified. The result
            is also remembered for ``train_test_spliting``.

        Raises:
            KeyError: If an expected column is missing from the data.
        """
        if data is None:
            data = pd.read_csv(self.config.data_path)

        categorical = list(self.CATEGORICAL_FEATURES)

        # Build every indicator frame first so the frames are joined only once.
        indicator_frames = [
            data[feature].str.get_dummies().add_prefix(feature + '_')
            for feature in categorical
        ]

        # NOTE(review): the ingested CSV displayed in this notebook carries an
        # 'Unnamed: 0' column that survives into the output - confirm whether
        # it should be dropped as well.
        remaining = data.drop(columns=categorical + list(self.DROPPED_COLUMNS))
        encoded = pd.concat([remaining, *indicator_frames], axis=1)

        self._encoded_data = encoded
        return encoded

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Split the encoded data and write ``train.csv`` / ``test.csv``.

        Uses the frame produced by the latest ``catrgorical_encoding`` call;
        if that has not been run yet, the data is encoded from
        ``self.config.data_path`` first.

        Args:
            test_size: Fraction of rows placed in the test set
                (0.25 gives a 0.75 / 0.25 split).
            random_state: Seed for the shuffle so that the split is
                reproducible; pass ``None`` for a different split on each run.
        """
        data = self._encoded_data
        if data is None:
            data = self.catrgorical_encoding()

        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

    def standarization(self):
        """Placeholder for feature scaling; currently does nothing."""
        pass

In [29]:
# Run the data-transformation stage end to end: load the configuration,
# encode the categorical columns, then write the train/test splits.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.catrgorical_encoding()
    data_transformation.train_test_spliting()
except Exception as e:
    # Record the failure with its traceback in the project log, then let the
    # original exception propagate unchanged.
    logger.exception(e)
    raise

[2024-06-04 13:33:27,058: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-04 13:33:27,062: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-04 13:33:27,066: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-04 13:33:27,067: INFO: common: created directory at: artifacts]
[2024-06-04 13:33:27,068: INFO: common: created directory at: c:\\Users\\Admin\\Desktop\\Rohit\\MachineLearning\\ml-mlops-workflow\\artifacts\\data_transformation]
[2024-06-04 13:33:27,401: INFO: 879279953: Splited data into training and test sets]
[2024-06-04 13:33:27,402: INFO: 879279953: (6339, 39)]
[2024-06-04 13:33:27,403: INFO: 879279953: (2113, 39)]
(6339, 39)
(2113, 39)
