In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from li

In [42]:
diamonds = pd.read_csv('../data/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/diamonds_test.csv')

In [43]:
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'price'

In [44]:
numeric_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), 
                ('scaler', StandardScaler())])

In [45]:
categorical_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [46]:
preprocessor = \
ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS),
                                ('cat', categorical_transformer, CAT_FEATS)])

In [47]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [48]:
pd.DataFrame(data=preprocessor.fit_transform(diamonds)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [51]:
from sklearn.metrics import mean_squared_error

In [52]:
y_test = (diamonds_test[TARGET])
X_test = diamonds_test [FEATS]
y_train = (diamonds_train[TARGET])
X_train = diamonds_train[TARGET]


In [65]:
X_train,X_test,y_train,y_test=train_test_split(diamonds.drop(columns="price"),diamonds["price"],random_state=3000,train_size=0.8)


In [66]:
model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor(n_jobs=-1))])

In [67]:
model.fit(X_train,y_train)
y_predict=model.predict(X_test)
print(f"RMSE={mean_squared_error(y_test,y_predict)**0.5}")

RMSE=559.8495916090152


In [71]:
from sklearn.model_selection import l 

param_grid = {
    'preprocessor__num__imputer__strategy': ['median'],
    'regressor__n_estimators': [150,200,300],
    'regressor__max_depth': [25,50,100],
}

grid_search = GridSearchCV(model, 
                                 param_grid, 
                                 cv=4, 
                                 verbose=1, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 4 folds for each of 9 candidates, totalling 36 fits


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['carat',
                                                                          'depth',
                                                                          'table',
                                                                          'x',
                                                                          'y',
                  

In [72]:
grid_search.best_params_


{'preprocessor__num__imputer__strategy': 'median',
 'regressor__max_depth': 100,
 'regressor__n_estimators': 300}

In [73]:
grid_search.best_score_

-557.3738897753732

RuntimeError: Given feature/column names do not match the ones for the data given during fit.

In [76]:
submission_df = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_test})

In [77]:
submission_df


Unnamed: 0,id,price
0,0.0,
1,1.0,
2,2.0,
3,3.0,738.0
4,4.0,
...,...,...
40422,,1114.0
40438,,6570.0
40439,,6623.0
40447,,3349.0
