In [45]:
# Mount Google Drive in the Colab runtime; the helper modules and the data set used below are read from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import sys

# Folder on the mounted Drive that holds the project's helper modules.
RUTA_MODULOS = '/content/drive/My Drive/Colab Notebooks'

# Guarded so that re-running this cell does not keep adding duplicate entries to sys.path.
if RUTA_MODULOS not in sys.path:
    sys.path.append(RUTA_MODULOS)

In [0]:
import time
from warnings import simplefilter

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

import common_machine_learning as common
import feature_categorica as feature_categorica
import feature_num as feature_num
import feature_string as feature_string
import my_pipeline as my_pipe

simplefilter(action='ignore', category=FutureWarning)

## Constantes

In [0]:
# Name of the column the model predicts.
TARGET = 'precio'
# Number of cross-validation folds (passed as cv=K to cross_val_score).
K = 2
RANDOM_SEMILLA = 3 # random seed: favorite (lucky) number
# Maximum number of hyperopt evaluations (passed as max_evals to fmin).
N_ITER = 100

## Train

In [0]:
# Load the training set through the project's loader helper.
# NOTE(review): judging by its name, cargar_set_optimizado presumably applies memory-friendly dtypes
# while reading the CSV — confirm in common_machine_learning.
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

## Features

In [0]:
def limpiar(df):
    """Fill the missing values of the listing columns of ``df``, in place.

    Fill rules:
      * ``metroscubiertos``, ``antiguedad``, ``habitaciones`` -> mean of that column in ``df``
      * ``garages``, ``metrostotales`` -> 0
      * ``banos`` -> 1
      * ``tipodepropiedad`` -> 'Casa'
      * ``ciudad`` -> empty string

    Args:
        df: pandas DataFrame that contains every column listed above. It is
            modified in place; a missing column raises ``KeyError``.

    Returns:
        None.
    """
    # Mean imputation. The mean is computed from `df` itself, i.e. from
    # whichever frame is passed in.
    for columna in ('metroscubiertos', 'antiguedad', 'habitaciones'):
        df[columna] = df[columna].fillna(df[columna].mean())

    # Constant imputation. The original code filled 'metroscubiertos' a second
    # time here; that call was a no-op after the first fill and was removed.
    rellenos_fijos = {
        'garages': 0,
        'banos': 1,
        'tipodepropiedad': 'Casa',
        'metrostotales': 0,
        'ciudad': "",
    }
    for columna, valor in rellenos_fijos.items():
        df[columna] = df[columna].fillna(valor)

def nuevas_features(df, precios_tipo, precio_m2, promedios, default_m2):
    """Add the derived features to ``df`` in place.

    Columns written:
      * ``ratio_cubierto``: ``metroscubiertos / metrostotales``, or 1 when
        ``metrostotales`` is 0.
      * ``tipodepropiedad``: overwritten with ``precios_tipo.loc[tipo]['precio_por_tipo']``.
      * ``precio_x_m2``: ``precio_m2[ciudad]``, falling back to ``default_m2``.
      * ``mean_hab``, ``mean_ban``, ``mean_gar``: ``promedios[<name>][ciudad]``,
        falling back to ``default_m2``.

    Args:
        df: pandas DataFrame with the columns ``metroscubiertos``,
            ``metrostotales``, ``tipodepropiedad`` and ``ciudad``; modified in place.
        precios_tipo: DataFrame indexed by property type with a
            ``precio_por_tipo`` column.
        precio_m2: dict mapping city -> value for ``precio_x_m2``.
        promedios: dict with the keys ``mean_hab``, ``mean_ban`` and
            ``mean_gar``, each mapping city -> value.
        default_m2: fallback used for cities missing from the dicts.

    Returns:
        None.

    Raises:
        KeyError: if a property type in ``df`` is not in the index of
            ``precios_tipo``.
    """
    df['ratio_cubierto'] = df.apply(lambda x: x['metroscubiertos']/x['metrostotales'] if x['metrostotales'] else 1, axis = 1)
    df['tipodepropiedad'] = df['tipodepropiedad'].apply(lambda x: precios_tipo.loc[x]['precio_por_tipo'])

    # Plain dict lookups per city: iterate the column once instead of building
    # a row Series per record with df.apply(axis=1).
    ciudades = list(df['ciudad'])

    # Fix: use the `precio_m2` argument. The previous code read a global named
    # `precio_x_m2`, so the argument was ignored (and the call failed with
    # NameError whenever that global did not exist).
    df['precio_x_m2'] = [precio_m2.get(ciudad, default_m2) for ciudad in ciudades]

    # NOTE(review): the fallback for the per-city means is `default_m2`, which
    # the caller in this notebook computes as a price per square metre —
    # confirm that this is the intended default for room/bath/garage averages.
    for nombre in ('mean_hab', 'mean_ban', 'mean_gar'):
        por_ciudad = promedios[nombre]
        df[nombre] = [por_ciudad.get(ciudad, default_m2) for ciudad in ciudades]

In [0]:
# Mean price per property type, taken before limpiar() fills the missing types with 'Casa'.
train_precios_por_tipo = (
    train.groupby('tipodepropiedad')['precio']
    .mean()
    .to_frame('precio_por_tipo')
)
limpiar(train)

In [0]:
# Price per covered square metre of every listing. nuevas_features() below
# overwrites this column with the per-city value.
# NOTE(review): the per-city price is derived from the target column over the
# whole training set and is later used as a model feature, so the
# cross-validated score may be optimistic (target leakage) — confirm this is acceptable.
train['precio_x_m2'] = train['precio']/train['metroscubiertos']

# Per-city mean price per m2, computed once and reused for the lookup dict and
# for the fallback value (the mean of the city means).
precio_m2_por_ciudad = train.groupby('ciudad')['precio_x_m2'].mean()
precio_x_m2 = precio_m2_por_ciudad.to_dict()
default = precio_m2_por_ciudad.mean()

# Per-city means of rooms, garages and bathrooms as {column: {city: mean}}.
# The previous version joined these means back onto every training row before
# calling to_dict(), which collapses the duplicated city index again; the
# grouped frame yields the same mapping directly. observed=True keeps only
# cities that actually occur in `train`, matching what the join produced.
promedios = (
    train.groupby('ciudad', observed=True)
    .agg({'habitaciones':'mean', 'garages':'mean', 'banos':'mean'})
    .rename(columns={'habitaciones':'mean_hab', 'banos':'mean_ban', 'garages':'mean_gar'})
    [['mean_hab','mean_gar','mean_ban']]
    .to_dict()
)
nuevas_features(train, train_precios_por_tipo, precio_x_m2, promedios, default)

In [0]:
# Complete missing lat/lng values with the project's feature_num helpers
# (by province/city, then by idzona mean, then by a country-wide average, judging by the helper names).
# NOTE(review): the first two helpers are used through their return value while the third is called
# only for its side effect — presumably it mutates `train` in place; confirm in feature_num.
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
# 'extras' is the row-wise sum of the five amenity/proximity columns, each cast to int
# (a count of available extras if the columns are 0/1 flags — presumably; verify against the data).
columnas_extras = [
    'gimnasio',
    'usosmultiples',
    'piscina',
    'escuelascercanas',
    'centroscomercialescercanos',
]
train['extras'] = train[columnas_extras].astype(int).sum(axis=1)

In [0]:
# nuevas_features() replaced the property type with a price looked up per type,
# so give the column a name that says so.
nombres_nuevos = {'tipodepropiedad': 'tipodepropiedad_precio_mean'}
train.rename(columns=nombres_nuevos, inplace=True)

In [0]:
# Add the numeric date feature through the project helper; the return value is discarded and a
# 'fecha_numerica' column is listed in train.columns afterwards, so the helper presumably works in place.
feature_num.agregar_feature_fecha_numerica(train)

In [57]:
# Inspect the columns available after feature engineering, before choosing the model inputs.
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad_precio_mean',
       'direccion', 'ciudad', 'provincia', 'antiguedad', 'habitaciones',
       'garages', 'banos', 'metroscubiertos', 'metrostotales', 'idzona', 'lat',
       'lng', 'fecha', 'gimnasio', 'usosmultiples', 'piscina',
       'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'precio_x_m2', 'ratio_cubierto', 'mean_hab', 'mean_ban', 'mean_gar',
       'extras', 'fecha_numerica'],
      dtype='object')

In [0]:
# Columns that are not meant to be model inputs, grouped by kind.
# NOTE(review): no other cell visible in this notebook reads this list (the selection
# is done through FEATURES) — confirm whether it is still needed.
drop_columns = (
    # location identifiers
    ['provincia', 'ciudad', 'idzona']
    # amenity / proximity columns
    + ['gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']
    # free text
    + ['titulo', 'descripcion']
    # remaining raw columns
    + ['metrostotales', 'fecha']
)

In [0]:
# Columns kept for modelling, in the order they appear in the training frame.
# The last entry is the target column; it is split off into `y` further down.
FEATURES = [
    'tipodepropiedad_precio_mean',
    'lat', 'lng',
    'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos',
    'ratio_cubierto',
    'mean_hab', 'mean_ban', 'mean_gar',
    'precio_x_m2',
    'extras',
    'fecha_numerica',
    'precio',
]

In [0]:
# Move the listing id into the index so it stays attached to each row without being a model column.
# Not idempotent: running this cell a second time raises KeyError because 'id' is no longer a column.
train.set_index('id', inplace = True)

In [0]:
# Keep only the modelling columns (every row, columns in FEATURES order).
train = train.loc[:, FEATURES]

In [0]:
# Separate the target from the predictors; both are copies, independent of `train`.
y = train[TARGET].copy()
X = train.drop(columns=[TARGET]).copy()

## Objective, Space & Pipeline 

In [0]:
def objective(hyper_parametros):
    """Hyperopt objective: cross-validated error of an XGBRegressor.

    Builds an ``XGBRegressor`` with the sampled hyperparameters, scores it on the
    module-level ``X`` / ``y`` with ``K``-fold cross-validation using
    ``neg_mean_absolute_error``, prints the mean score and reports its negation
    (the mean absolute error) as the loss to minimise.

    Args:
        hyper_parametros: dict of keyword arguments for ``XGBRegressor``, as
            sampled from the search space. Entries given here take precedence
            over the defaults set below.

    Returns:
        dict with ``'loss'`` (mean absolute error averaged over the folds) and
        ``'status'`` (``STATUS_OK``).
    """
    parametros = {
        'objective': 'reg:squarederror',
        # Pin the model seed explicitly with the notebook's seed constant
        # instead of relying on the library's implicit default.
        'random_state': RANDOM_SEMILLA,
        **hyper_parametros,
    }
    xgb_regressor = XGBRegressor(**parametros)

    score = cross_val_score(xgb_regressor, X, y, scoring='neg_mean_absolute_error', cv=K).mean()

    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))

    # The scorer is negated MAE (higher is better); hyperopt minimises the loss.
    return {'loss': -score, 'status': STATUS_OK}

In [0]:
def _entero_desde_uno(etiqueta, cantidad):
    """Integer hyperparameter: hp.randint draw shifted by one, i.e. a value in 1..cantidad."""
    return hp.randint(etiqueta, cantidad) + 1


# Search space explored by hyperopt; subsample and colsample_bytree are held fixed.
space = {
    "n_estimators": _entero_desde_uno("xgb_re__n_estimators", 1000),
    "learning_rate": hp.uniform("xgb_re__learning_rate", 0.005, 0.03),
    "gamma": hp.uniform("xgb_re__gamma", 0.1, 5),
    "lambda": hp.uniform("xgb_re__lambda", 0.1, 5),
    "max_depth": _entero_desde_uno("xgb_re__max_depth", 11),
    "subsample": 0.79,
    "colsample_bytree": 0.8,
    "min_child_weight": _entero_desde_uno("xgb_re__min_child_weight", 5),
}

## Tuning

In [65]:
# Run the TPE search for at most N_ITER evaluations and report the wall-clock time in minutes.
# NOTE(review): fmin is called without a seed (rstate), so the sequence of sampled
# hyperparameters can differ between runs — confirm whether that is intended.
t0 = time.time()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=N_ITER)
t1 = time.time()
minutos = (t1 - t0)/60
print('Tiempo = {0:.2f} minutos'.format(minutos))

SCORE: -581466.562 params {'colsample_bytree': 0.8, 'gamma': 4.991384041220235, 'lambda': 2.3053488456461437, 'learning_rate': 0.013917354749031049, 'max_depth': 10, 'min_child_weight': 4, 'n_estimators': 880, 'subsample': 0.79}
SCORE: -990885.812 params {'colsample_bytree': 0.8, 'gamma': 0.679655589302391, 'lambda': 1.5635764360515823, 'learning_rate': 0.020685145367205598, 'max_depth': 1, 'min_child_weight': 2, 'n_estimators': 170, 'subsample': 0.79}
SCORE: -758637.625 params {'colsample_bytree': 0.8, 'gamma': 2.6760581122045637, 'lambda': 1.7317162826806134, 'learning_rate': 0.00742089401573268, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 266, 'subsample': 0.79}
SCORE: -603516.125 params {'colsample_bytree': 0.8, 'gamma': 3.271953783129848, 'lambda': 4.668213524549889, 'learning_rate': 0.017293825701377726, 'max_depth': 11, 'min_child_weight': 5, 'n_estimators': 188, 'subsample': 0.79}
SCORE: -755407.438 params {'colsample_bytree': 0.8, 'gamma': 0.7978473376448728, 'lambd

KeyboardInterrupt: ignored

SCORE: -581466.562   
params {  
  'colsample_bytree': 0.8,  
  'gamma': 4.991384041220235,  
  'lambda': 2.3053488456461437,  
  'learning_rate': 0.013917354749031049,  
  'max_depth': 10,  
  'min_child_weight': 4,  
  'n_estimators': 880,  
  'subsample': 0.79  
  }


In [0]:
# fmin returns the raw sampled value of each hp.* label (keys such as 'xgb_re__max_depth'),
# without the "+ 1" offsets or the fixed entries defined in `space`. space_eval maps that
# result back through the search space, giving the actual XGBRegressor keyword arguments.
print(space_eval(space, best))