In [247]:
import pandas as pd
import numpy as np
import time

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale, normalize
from sklearn.model_selection import cross_val_score

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

In [248]:
MAXIMAS_ITERACIONES = 50

In [231]:
def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [249]:
train = pd.read_csv("sets_de_datos/train.csv")
test = pd.read_csv("sets_de_datos/test.csv")

### Preparo el dataset

In [233]:
TARGET = 'precio'
columnas = ["tipodepropiedad", "ciudad", "antiguedad", "metroscubiertos",
           "metrostotales", "gimnasio", "usosmultiples", "piscina", "escuelascercanas", "centroscomercialescercanos", "precio"]

In [234]:
train = train[columnas]
train["antiguedad"].fillna(train["antiguedad"].mean(),inplace=True)
train["metroscubiertos"].fillna(train["metroscubiertos"].mean(), inplace=True)
train["metrostotales"].fillna(train["metrostotales"].mean(), inplace=True)
train["precio"].fillna(train["precio"].mean(), inplace=True)
train.dropna(subset=["tipodepropiedad", "ciudad"], inplace=True)
train.fillna(0, inplace=True)

In [235]:
train.isna().sum()

tipodepropiedad               0
ciudad                        0
antiguedad                    0
metroscubiertos               0
metrostotales                 0
gimnasio                      0
usosmultiples                 0
piscina                       0
escuelascercanas              0
centroscomercialescercanos    0
precio                        0
dtype: int64

In [236]:
train_OHE  = pd.get_dummies(train)

In [237]:
X = train_OHE.drop([TARGET], axis = 1).copy().values
y = list(train_OHE[TARGET].copy())

In [238]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 0)

### Mejores parámetros con tuneo bayesiano

In [239]:
def objective(hiperparametros):
    rf = RandomForestRegressor(**hiperparametros)
    score = cross_val_score(rf, X_train, y_train, cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

In [240]:
n_estimators  = hp.randint('n_estimators', 500 - 100)
max_depth = hp.randint("max_depth", 60 - 2)
min_samples_split = hp.randint("min_samples_split", 10-2)
min_samples_leaf = hp.randint("min_samples_leaf", 10-2)
max_features = hp.choice("max_features", list(range(1,train.shape[1])))



In [241]:
space = {
    "n_estimators": hp.randint("n_estimators", 500-100) + 100,
    "max_depth": hp.randint("max_depth", 60) + 2,
    "min_samples_split": hp.randint("min_samples_split", 10) + 2,
    "min_samples_leaf": hp.choice("min_samples_leaf", [2, 3, 4, 5, 6, 7, 8, 9, 10])
    "max_features": hp.choice("max_features", list(range(1,train.shape[1])))
}
#    "max_features": hp.choice("max_features", list(range(1,train.shape[1])))
#    "max_depth": hp.choice("max_depth", np.linspace(1, 32, 32, endpoint=True)),
# "max_features": hp.choice("max_features", [1, 5, 'auto', 'sqrt', 'log2']) LINEA QUE ASESINA MIS ESFUERZOS
#    "min_samples_split": hp.choice("min_samples_split", range(2, 10)),
#    "min_samples_leaf": hp.choice("min_samples_leaf", range(2, 10)),
#     "min_samples_leaf": hp.randint("min_samples_leaf", 10) + 2,

In [242]:
trials = Trials()

inicio = time.time()
mejores_parametros = fmin(objective, space = space, algo=tpe.suggest, 
                          max_evals=MAXIMAS_ITERACIONES, trials= trials)
fin = time.time()

print("Tiempo de tuneo: {} minutos".format((fin - inicio) / 60))

100%|██████████| 50/50 [8:33:34<00:00, 1309.03s/it, best loss: -0.4398188914997473]  
Tiempo de tuneo: 513.60208081007 minutos


In [243]:
print("Mejores parámetros con optimización bayesiana:")
print(mejores_parametros)

Mejores parámetros con optimización bayesiana:
{'max_depth': 50, 'max_features': 9, 'min_samples_leaf': 0, 'min_samples_split': 7, 'n_estimators': 6}


### Compruebo el maravilloso score:

In [244]:
inicio = time.time()
regressor = RandomForestRegressor(n_estimators = mejores_parametros["n_estimators"],
                                  max_depth = mejores_parametros["max_depth"],
                                  min_samples_split = mejores_parametros["min_samples_split"],
                                  min_samples_leaf= mejores_parametros["min_samples_leaf"],
                                  max_features = mejores_parametros["max_features"],
                                  random_state=0)
regressor.fit(X_train, y_train)
fin = time.time()
print("Tiempo de entrenamiento: {} minutos".format((fin - inicio) / 60))

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

In [None]:
evaluar_rf(regressor, X_test, y_test)

In [None]:
list(hp.randint("n_estimators", 500-100))

In [254]:
list(train.provincia.unique())

['Distrito Federal',
 'Jalisco',
 'Edo. de México',
 'Oaxaca',
 'Quintana Roo',
 'Colima',
 'San luis Potosí',
 'Nuevo León',
 'Querétaro',
 'Tamaulipas',
 'Puebla',
 'Yucatán',
 'Morelos',
 'Guerrero',
 'Tabasco',
 'Guanajuato',
 'Hidalgo',
 'Veracruz',
 'Chihuahua',
 'Aguascalientes',
 'Sonora',
 'Michoacán',
 'Baja California Norte',
 'Baja California Sur',
 'Coahuila',
 'Durango',
 'Sinaloa',
 'Chiapas',
 'Nayarit',
 'Tlaxcala',
 'Campeche',
 'Zacatecas',
 nan]