In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
diamonds= pd.read_csv('./diamonds_train.csv.zip')
diamonds_predict = pd.read_csv('./diamonds_test.csv')

In [3]:
diamonds.head().T

Unnamed: 0,0,1,2,3,4
carat,1.21,0.32,0.71,0.41,1.02
cut,Premium,Very Good,Fair,Good,Ideal
color,J,H,G,D,G
clarity,VS2,VS2,VS1,SI1,SI1
depth,62.4,63,65.5,63.8,60.5
table,58,57,55,56,59
price,4268,505,2686,738,4882
x,6.83,4.35,5.62,4.68,6.55
y,6.79,4.38,5.53,4.72,6.51
z,4.25,2.75,3.65,3,3.95


In [4]:
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATURES = ['cut', 'color', 'clarity']
TARGET = 'price'
FEATS = NUM_FEATURES + CAT_FEATURES

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
imputer = SimpleImputer(strategy= 'mean')
scaler = StandardScaler()
scaler.fit_transform(diamonds[NUM_FEATURES])

array([[ 0.8670056 ,  0.45201864,  0.24798091,  0.97880679,  0.92198533,
         1.02265738],
       [-1.00455749,  0.8710986 , -0.19974534, -1.22673789, -1.17981558,
        -1.1292594 ],
       [-0.18443434,  2.61726508, -1.09519783, -0.09728557, -0.17688154,
         0.16189067],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158,  0.56971383,  0.5993022 ,
         0.6783507 ],
       [-0.98352869,  0.10278535, -1.4086062 , -1.13780463, -1.10132509,
        -1.11491329],
       [ 0.93009199,  0.172632  ,  0.24798091,  0.97880679,  1.00047582,
         1.02265738]])

In [7]:
diamonds[NUM_FEATURES]

Unnamed: 0,carat,depth,table,x,y,z
0,1.21,62.4,58.0,6.83,6.79,4.25
1,0.32,63.0,57.0,4.35,4.38,2.75
2,0.71,65.5,55.0,5.62,5.53,3.65
3,0.41,63.8,56.0,4.68,4.72,3.00
4,1.02,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...
40450,1.34,62.7,57.0,7.10,7.04,4.43
40451,2.02,57.1,60.0,8.31,8.25,4.73
40452,1.01,62.7,56.0,6.37,6.42,4.01
40453,0.33,61.9,54.3,4.45,4.47,2.76


In [8]:
numerical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'mean')),
                                          ('scaler', StandardScaler())])

In [9]:
numerical_transformer.fit_transform(diamonds[NUM_FEATURES])

array([[ 0.8670056 ,  0.45201864,  0.24798091,  0.97880679,  0.92198533,
         1.02265738],
       [-1.00455749,  0.8710986 , -0.19974534, -1.22673789, -1.17981558,
        -1.1292594 ],
       [-0.18443434,  2.61726508, -1.09519783, -0.09728557, -0.17688154,
         0.16189067],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158,  0.56971383,  0.5993022 ,
         0.6783507 ],
       [-0.98352869,  0.10278535, -1.4086062 , -1.13780463, -1.10132509,
        -1.11491329],
       [ 0.93009199,  0.172632  ,  0.24798091,  0.97880679,  1.00047582,
         1.02265738]])

In [10]:
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy= 'constant',
                                                                   fill_value = 'missing')),
                                          ('encoder', OrdinalEncoder(handle_unknown = 'ignore'))])

In [11]:
preprocessor= ColumnTransformer(transformers = [('numerical_preprocessor', numerical_transformer, NUM_FEATURES),
                                               ('categorical_preprocessor', categorical_transformer, CAT_FEATURES)])

In [12]:
preprocessor.fit_transform(diamonds[FEATS])

array([[ 0.8670056 ,  0.45201864,  0.24798091, ...,  3.        ,
         6.        ,  5.        ],
       [-1.00455749,  0.8710986 , -0.19974534, ...,  4.        ,
         4.        ,  5.        ],
       [-0.18443434,  2.61726508, -1.09519783, ...,  0.        ,
         3.        ,  4.        ],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158, ...,  2.        ,
         4.        ,  2.        ],
       [-0.98352869,  0.10278535, -1.4086062 , ...,  2.        ,
         6.        ,  4.        ],
       [ 0.93009199,  0.172632  ,  0.24798091, ...,  2.        ,
         5.        ,  2.        ]])

In [13]:
pd.DataFrame(data=preprocessor.fit_transform(diamonds[FEATS]))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,3.0,6.0,5.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,4.0,4.0,5.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,0.0,3.0,4.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,1.0,0.0,2.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,2.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...
40450,1.140380,0.661559,-0.199745,1.218927,1.140014,1.280887,2.0,3.0,4.0
40451,2.570338,-3.249854,1.143433,2.295019,2.195276,1.711271,1.0,2.0,3.0
40452,0.446430,0.661559,-0.647472,0.569714,0.599302,0.678351,2.0,4.0,2.0
40453,-0.983529,0.102785,-1.408606,-1.137805,-1.101325,-1.114913,2.0,6.0,4.0


In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [31]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [72]:
model = Pipeline(steps = [('preprocessor', preprocessor),
                         ('regressor', RandomForestRegressor(n_jobs = -1))])

In [73]:
X_train = diamonds_train[FEATS]
y_train = diamonds_train[TARGET]

In [74]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('categorical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                      

In [75]:
y_train_predict = model.predict(X_train)

In [76]:
y_train.values

array([2531, 3449, 3710, ..., 4140, 4685, 3307])

In [77]:
y_train_predict

array([2469.2 , 3266.82, 3592.03, ..., 4181.82, 4545.76, 3332.78])

In [78]:
from sklearn.metrics import mean_squared_error

In [79]:
mean_squared_error(y_true = y_train, y_pred= y_train_predict, squared = False)

211.01703128277472

In [80]:
X_test = diamonds_test[FEATS]
y_test = diamonds_test[TARGET]

In [81]:
y_test_predict = model.predict(X_test)

In [82]:
mean_squared_error(y_true = y_test, y_pred = y_test_predict, squared = False)

578.4041645751585

In [83]:
from sklearn.model_selection import cross_val_score

In [84]:
X = diamonds[FEATS]
y = diamonds[TARGET]

In [85]:
cross_val_score(model, X, y, scoring = 'neg_root_mean_squared_error', cv = 4, n_jobs= -1).mean()

-558.9586618964257

In [86]:
y_submission = model.predict(diamonds_predict[FEATS])
pd.DataFrame({'price': y_submission, 'id': diamonds_predict.id}).to_csv('jaimevazquez.csv',index = False)

In [87]:
# hiperparámetros: El resto de cosas que definen al algoritmo pero que el modelo no aprende por si solo en el entrenamiento. Cosas por encima del .fit.
# tunning de hiperparámetros: usando from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV


In [88]:
model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('categorical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                      

In [89]:
parameter_grid = {
    'regressor__n__estimators': [64, 128, 256, 512]
}

In [90]:
?RandomForestRegressor

[0;31mInit signature:[0m
[0mRandomForestRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'mse'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_split[0m[0;34m=[0m[0;32mNone[0m[0;34m

In [91]:
grid_search = RandomizedSearchCV(model,
                                 parameter_grid,
                                 cv = 5,
                                 verbose= 10,
                                 scoring = 'neg_root_mean_squared_error',
                                 n_jobs= -1,
                                 n_iter= 4)

In [69]:
grid_search.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


ValueError: Invalid parameter n for estimator RandomForestRegressor(n_jobs=-1). Check the list of available parameters with `estimator.get_params().keys()`.

In [70]:
estimator.get_params().keys()

NameError: name 'estimator' is not defined