In [45]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

import numpy as np
# Seed NumPy's global RNG so the stochastic steps further down are repeatable
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)

# Load the raw car-sales table and drop every row whose 'Price' is missing.
# 'Price' is the prediction target later in the notebook, so such rows cannot be
# used for training or scoring. The read and the filter are chained (no
# inplace=True) so `data` is only ever bound to the cleaned frame and re-running
# this cell always yields the same object state.
data = pd.read_csv('car-sales-extended-missing-data.csv').dropna(subset=['Price'])

# Column groups, one per preprocessing recipe.
cat_features = ['Make', 'Colour']
door_feature = ['Doors']
num_features = ['Odometer (KM)']

# Categorical columns: fill gaps with the literal string 'missing', then
# one-hot encode (handle_unknown='ignore' so categories not seen during fit
# do not raise at transform time).
cat_trans = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Door count: fill gaps with the constant 4.
door_trans = Pipeline([
    ('imputer', SimpleImputer(fill_value=4, strategy='constant')),
])

# Odometer reading: fill gaps with the column mean.
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

In [46]:
# Route each column group through its own transformer pipeline. Each entry is
# (name, transformer, columns); the names 'cat', 'door' and 'num' are what the
# grid-search parameter keys refer to (e.g. preprocessing__num__imputer__strategy).
preproc = ColumnTransformer([
    ('cat', cat_trans, cat_features),
    ('door', door_trans, door_feature),
    ('num', num_trans, num_features),
])

In [47]:
# End-to-end estimator: column preprocessing followed by a random-forest
# regressor with default hyperparameters. The step names 'preprocessing' and
# 'model' are the prefixes used when addressing nested parameters.
model = Pipeline([
    ('preprocessing', preproc),
    ('model', RandomForestRegressor()),
])

In [48]:
# Features are every column except the target; 'Price' is the target.
x = data.drop(columns='Price')
y = data['Price']

# Hold out 20% of the rows for testing. random_state is pinned so the split does
# not depend on how much of the global NumPy RNG has been consumed before this
# cell runs: re-running the cell (or running cells out of order) reproduces the
# exact same train/test partition.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [49]:
# Fit the whole pipeline on the training split, then show its score on the
# held-out split as the cell output (fit returns the fitted pipeline itself).
model.fit(xtrain, ytrain).score(xtest, ytest)

0.22188417408787875

In [55]:
# Hyperparameter grid for the pipeline. Keys follow the nested
# <step>__<sub-step>__<param> naming, so each one addresses a parameter of a
# specific component. 2 * 2 * 2 * 1 * 2 = 16 candidate combinations.
pipe_grid = dict(
    # How the odometer imputer fills missing values.
    preprocessing__num__imputer__strategy=['mean', 'median'],
    # Random-forest settings to compare.
    model__n_estimators=[100, 1000],
    model__max_depth=[None, 5],
    model__max_features=['sqrt'],
    model__min_samples_split=[2, 4],
)

In [61]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over pipe_grid with 5-fold cross-validation. verbose=5
# prints one line per fold, including its score and fit time.
gsmodel = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=5)

# Search on the training split only; the test split stays untouched for the
# final evaluation. Kept as a separate statement so `gsmodel` is already bound
# even if the (long-running) fit is interrupted.
gsmodel.fit(xtrain, ytrain)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean;, score=0.126 total time=   0.1s
[CV 2/5] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean;, score=0.024 total time=   0.0s
[CV 3/5] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean;, score=0.281 total time=   0.0s
[CV 4/5] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean;, score=0.228 total time=   0.0s
[CV 5/5] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean;, score=0.099 tot

KeyboardInterrupt: 

In [57]:
# Score the grid search's refit best estimator on the held-out split.
# NOTE(review): this requires a completed gsmodel.fit(...) run beforehand.
gsmodel.score(X=xtest, y=ytest)

0.294792289642396

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

# Seed NumPy's global RNG so the stochastic steps below are repeatable on a
# fresh top-to-bottom run.
np.random.seed(42)

# Load the dataset and drop every row whose 'Price' (the prediction target) is
# missing. The read and the filter are chained instead of using inplace=True, so
# `data` is only ever bound to the cleaned frame and the statement is idempotent.
data = pd.read_csv('car-sales-extended-missing-data.csv').dropna(subset=['Price'])

# Column groups and the imputation / encoding applied to each.

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
cat_features = ['Make', 'Colour']
cat_trans = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Door count: fill gaps with the constant 4.
door_feature = ['Doors']
door_trans = Pipeline([('imputer', SimpleImputer(fill_value=4, strategy='constant'))])

# Odometer reading: fill gaps with the column mean (the parameter grid further
# down also tries the median).
num_features = ['Odometer (KM)']
num_trans = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Preprocessor: send each column group through its matching transformer.
# Entries are (name, transformer, columns); the names are reused as prefixes in
# the grid-search parameter keys.
preproc = ColumnTransformer([
    ('cat', cat_trans, cat_features),
    ('door', door_trans, door_feature),
    ('num', num_trans, num_features),
])

# Full modelling pipeline: preprocessing first, then a RandomForestRegressor
# with default hyperparameters (the grid search overrides some of them).
model = Pipeline([
    ('preprocessing', preproc),
    ('model', RandomForestRegressor()),
])

# Separate the target ('Price') from the feature columns.
y = data['Price']
X = data.drop(columns='Price')

# 80/20 train/test partition with a fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Parameter grid to search. Keys use the nested <step>__<sub-step>__<param>
# naming so each one targets a specific component of the pipeline.
pipe_grid = dict(
    preprocessing__num__imputer__strategy=['mean', 'median'],
    model__n_estimators=[100, 1000],
    model__max_depth=[None, 5],
    model__max_features=['sqrt'],
    model__min_samples_split=[2, 4],
)

# Set up an exhaustive search over pipe_grid using 5-fold cross-validation;
# verbose=2 prints progress for the individual fits.
gsmodel = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=2)

# Run the search on the training split only, keeping the test split unseen.
gsmodel.fit(X_train, y_train)

# Report the winning parameter combination, then the score of the search's
# refit best estimator on the held-out test split.
best_params = gsmodel.best_params_
print("Best parameters found: ", best_params)
test_score = gsmodel.score(X_test, y_test)
print("Test score: ", test_score)
