In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

# Forming Dataset
np.random.seed(42)
brand = np.random.choice(["Maruti", "Scoda", "BMW", "Range Rover"], 1000)
km_driven = np.random.uniform(10000, 50000, size=1000)
fuel = np.random.choice(["Petrol", "Diesel", "CNG", "LPG"], 1000)
owner = np.random.choice(["First", "Second", "Third", "Fourth"], 1000)
selling_price = np.random.uniform(100000, 500000, size=1000)

df = pd.DataFrame({
    "brand": brand,
    "km_driven": km_driven,
    "fuel": fuel,
    "owner": owner,
    "selling_price": selling_price
})
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,BMW,37926.468561,LPG,Fourth,431007.568843
1,Range Rover,31443.854654,Diesel,Third,405811.117862
2,Maruti,22381.104651,Diesel,Third,329411.580582
3,BMW,42551.800788,LPG,Third,482418.857445
4,BMW,37389.246902,Petrol,Fourth,180189.806211


In [2]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)

## **ColumnTransformer**
Think of Sklearn’s ColumnTransformer as a toolkit that lets you treat different columns in your dataset as independent mini-pipelines. Each column (or group of columns) gets its own preprocessing recipe, and all of these recipes run side-by-side in a single, clean step—no interference, no manual merging of results. You can apply multiple transformations on a single column inside one ColumnTransformer.


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

transformer = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(categories=[["First", "Second", "Third", "Fourth"]]), ['owner']),
        ("onehot", OneHotEncoder(categories=[["Maruti", "Scoda", "BMW", "Range Rover"], ["Petrol", "Diesel", "CNG", "LPG"]], sparse_output=False), ['brand', 'fuel']) # OneHotEncoder returns a sparse matrix by default, we set it to False to get a dense matrix
    ],
    remainder='passthrough', # remainder='passthrough' to keep the columns that are not transformed
    verbose=True, # verbose=True to print the progress
    verbose_feature_names_out=True, # verbose_feature_names_out=True to get the feature names in the output
)

# setting to get a pandas df
transformer.set_output(transform='pandas')

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,True
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['First', 'Second', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Maruti', 'Scoda', ...], ['Petrol', 'Diesel', ...]]"
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [4]:
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

[ColumnTransformer] ....... (1 of 3) Processing ordinal, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing onehot, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s


In [5]:
X_train_transformed.head()

Unnamed: 0,ordinal__owner,onehot__brand_Maruti,onehot__brand_Scoda,onehot__brand_BMW,onehot__brand_Range Rover,onehot__fuel_Petrol,onehot__fuel_Diesel,onehot__fuel_CNG,onehot__fuel_LPG,remainder__km_driven
29,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,44588.89505
535,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,37851.551035
695,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,44861.472246
557,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,26077.236544
836,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,17981.698037


#### ColumnTransformer Attributes

In [11]:
transformer.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [7]:
transformer.get_feature_names_out()

array(['ordinal__owner', 'onehot__brand_Maruti', 'onehot__brand_Scoda',
       'onehot__brand_BMW', 'onehot__brand_Range Rover',
       'onehot__fuel_Petrol', 'onehot__fuel_Diesel', 'onehot__fuel_CNG',
       'onehot__fuel_LPG', 'remainder__km_driven'], dtype=object)

In [8]:
transformer.n_features_in_

4

In [9]:
transformer.transformers_

[('ordinal',
  OrdinalEncoder(categories=[['First', 'Second', 'Third', 'Fourth']]),
  ['owner']),
 ('onehot',
  OneHotEncoder(categories=[['Maruti', 'Scoda', 'BMW', 'Range Rover'],
                            ['Petrol', 'Diesel', 'CNG', 'LPG']],
                sparse_output=False),
  ['brand', 'fuel']),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  ['km_driven'])]

In [10]:
transformer.output_indices_

{'ordinal': slice(0, 1, None),
 'onehot': slice(1, 9, None),
 'remainder': slice(9, 10, None)}

*Because `ColumnTransformer` applies each transformation independently, you can stack several transformations on the same column; each transformation’s output is added as a distinct set of features in the final dataset.*

In [12]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Sample data
df = pd.DataFrame({
    'age': [25, 30, 35, np.nan, 40],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'city': ['NYC', 'LA', 'NYC', 'Chicago', 'LA']
})

# Apply multiple transformations to the same column
preprocessor = make_column_transformer(
    (StandardScaler(), ['age']),        # First transformation
    (MinMaxScaler(), ['age']),          # Second transformation (same column)
    (SimpleImputer(strategy='median'), ['age']),  # Third transformation
    remainder='passthrough'
)
preprocessor.set_output(transform='pandas')

# This will create multiple columns for 'age'
transformed = preprocessor.fit_transform(df)
print(transformed.shape)  # (5, 5) - original had 3 columns
transformed.head()

(5, 5)


Unnamed: 0,standardscaler__age,minmaxscaler__age,simpleimputer__age,remainder__salary,remainder__city
0,-1.341641,0.0,25.0,50000,NYC
1,-0.447214,0.333333,30.0,60000,LA
2,0.447214,0.666667,35.0,70000,NYC
3,,,32.5,80000,Chicago
4,1.341641,1.0,40.0,90000,LA


## **Pipelines**
A **Pipeline** in scikit-learn is a sequence of data-transforming steps that culminate in an estimator. It lets you treat preprocessing and model training as a single, reusable object: fit it once on training data and then call predict (or score) to have every step executed in order (sequently). Each transformer’s output becomes the next step’s input and the operations that you have mentioned will be applied on an entire dataset.

In [13]:
# Forming Dataset
np.random.seed(42)
brand = np.random.choice(["Maruti", "Scoda", "BMW", "Range Rover"], 1000)
km_driven = np.random.uniform(10000, 50000, size=1000)
fuel = np.random.choice(["Petrol", "Diesel", "CNG", "LPG"], 1000)
owner = np.random.choice(["First", "Second", "Third", "Fourth"], 1000)
selling_price = np.random.uniform(100000, 500000, size=1000)

df = pd.DataFrame({
    "brand": brand,
    "km_driven": km_driven,
    "fuel": fuel,
    "owner": owner,
    "selling_price": selling_price
})
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,BMW,37926.468561,LPG,Fourth,431007.568843
1,Range Rover,31443.854654,Diesel,Third,405811.117862
2,Maruti,22381.104651,Diesel,Third,329411.580582
3,BMW,42551.800788,LPG,Third,482418.857445
4,BMW,37389.246902,Petrol,Fourth,180189.806211


In [None]:
np.random.seed(42)
# Introduce missing values in 'km_driven' column (5% missing values)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan

In [15]:
# Null Values count
df.isnull().sum()

brand             0
km_driven        50
fuel              0
owner            10
selling_price     0
dtype: int64

In [16]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, 29 to 102
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   brand      800 non-null    object 
 1   km_driven  800 non-null    float64
 2   fuel       800 non-null    object 
 3   owner      792 non-null    object 
dtypes: float64(1), object(3)
memory usage: 31.2+ KB


In [38]:
# Importing necessary librabres
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestRegressor

In [39]:
# Imputation transformer
trf1 = ColumnTransformer([
    ('impute_km_driven', SimpleImputer(), [1]), # You can also provide the index of feature instead of feature name
    ('impute_owner', SimpleImputer(strategy='most_frequent'), [3])
], remainder='passthrough')
trf1

0,1,2
,transformers,"[('impute_km_driven', ...), ('impute_owner', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [40]:
# Encoding categorical variables
trf2 = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,2])
    ],
    remainder='passthrough'
)
trf2

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [41]:
# Scaling
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,38)) # slice is an built-in function which slice the subset of values between provided index values
])
trf3

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [42]:
# train the model
trf4 = RandomForestRegressor(n_estimators=100, max_depth=2, verbose=True, oob_score=True)
trf4

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
# Building a Pipeline
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer', trf1),
    ('encoder', trf2),
    ('scaler', trf3),
    ('model', trf4)
], verbose=True)

pipe.fit(X_train, y_train)


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0,1,2
,steps,"[('imputer', ...), ('encoder', ...), ...]"
,transform_input,
,memory,
,verbose,True

0,1,2
,transformers,"[('impute_km_driven', ...), ('impute_owner', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [44]:
pipe.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [45]:
pipe.named_steps

{'imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_km_driven', SimpleImputer(), [1]),
                                 ('impute_owner',
                                  SimpleImputer(strategy='most_frequent'),
                                  [3])]),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  [3]),
                                 ('onehot',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 2])]),
 'scaler': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 38, None))]),
 'model': RandomForestRegressor(max_depth=2, oob_score=True, verbose=True)}

In [46]:
pipe.named_steps['scaler'].transformers_[0][1].data_max_

array([3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [47]:
pipe.predict(X_test)[10:40]

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([299399.32192633, 300240.18216907, 299399.32192633, 299222.8986478 ,
       299399.32192633, 299399.32192633, 301665.45167025, 301665.45167025,
       300240.18216907, 300240.18216907, 301665.45167025, 299222.8986478 ,
       299399.32192633, 300240.18216907, 299399.32192633, 300240.18216907,
       299399.32192633, 301665.45167025, 301665.45167025, 299222.8986478 ,
       300240.18216907, 299222.8986478 , 300240.18216907, 301665.45167025,
       300240.18216907, 299399.32192633, 300240.18216907, 299399.32192633,
       299222.8986478 , 301665.45167025])

In [49]:
# Predict
pipe.predict(pd.DataFrame(np.array(['Maruti',100000.0,'Diesel','First Owner']).reshape(1,4), columns=X_train.columns))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([300240.18216907])

### Cross Validation

In [50]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.3s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


np.float64(-13657878057.832312)

### Hyperparameter Tuning

In [51]:
# gridsearchcv
from sklearn.model_selection import GridSearchCV

params = {
    'model__max_depth':[1,2,3,4,5,None] # Pattern -> `tepname__parametername`
}

grid = GridSearchCV(pipe, params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s
[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s
[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing encoder, total=   0.0s
[Pipeline] ............ (step 3 of 4) Processing scaler, total=   0.0s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[Pipeline] ............. (step 4 of 4) Processing model, total=   0.2s


0,1,2
,estimator,Pipeline(step... verbose=True)
,param_grid,"{'model__max_depth': [1, 2, ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('impute_km_driven', ...), ('impute_owner', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
grid.best_score_

np.float64(-13631976376.99345)

In [53]:
grid.best_params_

{'model__max_depth': 1}

### Export the Pipeline

In [54]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))