In [25]:
# Mount Google Drive at /content/drive; later cells read files from under this path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import sys
# Add the Drive notebooks folder to the module search path.
# NOTE(review): this path uses 'Mi unidad' while the CSV-loading cell reads from
# 'My Drive' — confirm which folder name the mount actually exposes.
sys.path.append('/content/drive/Mi unidad/Colab Notebooks')

In [0]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [0]:
TARGET = "precio"  # column to predict: dropped from X and used as y further down
MAX_ITER = 10  # passed as n_iter to RandomizedSearchCV (number of sampled settings)

### Funciones auxiliares

In [0]:
def limpiar(df):
    """Fill missing values of the property DataFrame in place.

    Imputation rules:
      * ``antiguedad``, ``metroscubiertos``, ``habitaciones`` and
        ``metrostotales`` get their own column mean.
      * ``garages``, ``gimnasio``, ``usosmultiples``, ``piscina``,
        ``escuelascercanas`` and ``centroscomercialescercanos`` get 0.
      * ``banos`` gets 1 and ``tipodepropiedad`` gets ``'Casa'``.
      * ``provincia`` and ``ciudad`` get their most frequent value.

    Args:
        df: DataFrame containing all of the columns listed above. It is
            modified in place.

    Returns:
        None.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    # Numeric columns: impute with the column mean.
    for columna in ("antiguedad", "metroscubiertos", "habitaciones", "metrostotales"):
        df[columna] = df[columna].fillna(df[columna].mean())

    # Columns with a fixed default value.
    valores_por_defecto = {
        "garages": 0,
        "banos": 1,
        "tipodepropiedad": "Casa",
        "gimnasio": 0,
        "usosmultiples": 0,
        "piscina": 0,
        "escuelascercanas": 0,
        "centroscomercialescercanos": 0,
    }
    for columna, valor in valores_por_defecto.items():
        df[columna] = df[columna].fillna(valor)

    # Categorical columns: impute with the most frequent value.
    # mode() returns a Series; take its first element instead of to_string(),
    # which would write the text form of the Series (e.g. "0    Jalisco").
    for columna in ("provincia", "ciudad"):
        modas = df[columna].mode()
        if not modas.empty:  # an all-NaN column has no mode; leave it untouched
            df[columna] = df[columna].fillna(modas.iloc[0])

# Preparo el dataset

## Carga de datos

In [0]:
# Load both data sets from Drive, using the first CSV column as the index.
DIR_DATOS = '/content/drive/My Drive/Colab Notebooks/sets_de_datos'
train = pd.read_csv(DIR_DATOS + '/train.csv', index_col = 0)
# The test frame used to be read from train.csv as well, which made it a copy of
# the training data.
# NOTE(review): assumes the test set is stored as test.csv next to train.csv — confirm.
test = pd.read_csv(DIR_DATOS + '/test.csv', index_col = 0)


In [0]:
# Parse the "fecha" column of both frames as datetimes. Values that do not match
# the format become NaT because of errors='coerce'.
FORMATO_FECHA = "%Y-%m-%d %H:%M:%S"
for conjunto in (train, test):
    conjunto["fecha"] = pd.to_datetime(conjunto["fecha"], format=FORMATO_FECHA, errors='coerce')

## Limpio las columnas que voy a usar

In [0]:
# Treat empty strings in "provincia" as missing and fill every missing value with
# the most frequent province of the same frame.
# The previous version passed the Series returned by mode() as replacement values
# and relied on inplace=True on a column selection, which is not guaranteed to
# write back into the frame.
for conjunto in (train, test):
    provincia = conjunto["provincia"].replace("", np.nan)
    modas = provincia.mode()
    if not modas.empty:  # nothing to fill with when the column is entirely missing
        provincia = provincia.fillna(modas.iloc[0])
    conjunto["provincia"] = provincia

In [0]:
# Impute missing values on both frames; limpiar modifies its argument in place.
limpiar(train)
limpiar(test)

In [0]:
# Normalise any stray '0    Casa' label in test back to 'Casa' (literal match).
test['tipodepropiedad'] = test['tipodepropiedad'].str.replace('0    Casa', "Casa", regex=False)

# Keep only the training rows whose property type also occurs in test.
# The previous version assigned a whole filtered DataFrame to the single
# "tipodepropiedad" column instead of filtering the rows.
# NOTE(review): presumably done so the one-hot columns of train and test match — confirm.
tipos_en_test = test["tipodepropiedad"].unique()
train = train[train["tipodepropiedad"].isin(tipos_en_test)].copy()
train.dropna(subset=["tipodepropiedad"], inplace=True)

## Cargo features adicionales

In [0]:
# Add the poverty-percentage feature to both frames via the external `feature` helper.
# NOTE(review): `feature` is not imported in any visible cell — presumably a module
# in the folder appended to sys.path; confirm and import it explicitly.
train = feature.agregar_feature_pobreza_porcentual(train)
test = feature.agregar_feature_pobreza_porcentual(test)

In [0]:
# Show the columns of train after the poverty feature step.
train.columns

In [0]:
# Add the PBI features to both frames via the external `feature` helper.
# NOTE(review): `feature` is not imported in any visible cell — confirm where it comes from.
train = feature.agregar_feature_pbi(train)
test = feature.agregar_feature_pbi(test)

In [0]:
# Show the columns of train after the PBI feature step.
train.columns

In [0]:
# Complete lat/lng on train with three successive `feature_num` helpers.
# NOTE(review): `feature_num` is not imported in any visible cell — confirm where it comes from.
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
# NOTE(review): unlike the two calls above, the result is not assigned back —
# presumably this helper modifies the frame in place; verify.
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
# Show the columns of train after the lat/lng completion step.
train.columns

In [0]:
# Complete lat/lng on test with the same three `feature_num` helpers used for train.
# NOTE(review): `feature_num` is not imported in any visible cell — confirm where it comes from.
test = feature_num.completar_lat_lng_con_provincias_y_ciudades(test)
test = feature_num.completar_lat_lng_con_idzona_mean(test)
# NOTE(review): the result is not assigned back — presumably this helper modifies
# the frame in place; verify.
feature_num.completar_lat_lng_con_promedio_Mexico(test)

In [0]:
# Preview the first rows of train after the feature steps above.
train.head()

In [0]:
# ratio_cubierto = metroscubiertos / metrostotales, or 1 when metrostotales is 0.
# Computed column-wise instead of with a row-by-row apply; the values are the same
# (a NaN total still yields NaN, because NaN != 0 is True).
for conjunto in (train, test):
    totales = conjunto['metrostotales']
    conjunto['ratio_cubierto'] = np.where(totales != 0, conjunto['metroscubiertos'] / totales, 1)

## Me quedo con algunas features

In [0]:
# Columns kept for modelling. The training selection is the same list of features
# with the "precio" column appended at the end.
columnas_test = [
    "tipodepropiedad", "antiguedad", "lat", "lng",
    "gimnasio", "usosmultiples", "piscina",
    "pbi_campo", "pbi_mineria", "pbi_energia_agua_gas", "pbi_construccion",
    "pbi_industrias_manufactureras", "pbi_comercio",
    "ratio_cubierto",
]
columnas_train = columnas_test + ["precio"]

In [0]:
# Restrict each frame to the columns chosen for modelling.
train = train.loc[:, columnas_train]
test = test.loc[:, columnas_test]

In [0]:
# One-hot encode train with pandas.get_dummies (no explicit column list is given).
# NOTE(review): presumably "tipodepropiedad" is the only non-numeric column left — confirm.
train_OHE  = pd.get_dummies(train)

In [0]:
# One-hot encode test and align it with the training features, so both matrices
# have the same columns in the same order. Dummy columns that exist only in train
# are added filled with 0; dummy columns that exist only in test are dropped.
columnas_modelo = train_OHE.columns.drop(TARGET)
test_OHE = pd.get_dummies(test).reindex(columns=columnas_modelo, fill_value=0)

# Tuneo

### Divido el train

In [0]:
# Feature matrix: every encoded column except the target, as a NumPy array.
features_train = train_OHE.drop([TARGET], axis = 1)
X = features_train.copy().values
# Target vector as a plain Python list.
precios = train_OHE[TARGET].copy()
y = list(precios)

In [0]:
# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

### Grilla de parámetros

In [0]:
# Search space sampled by RandomizedSearchCV for the random forest.
hiperparametros = {
    "n_estimators": [100, 200, 250, 300, 350, 400, 500],
    "max_depth": [10, 20, 30, 40, 50, 60],
    # None means "use all features". It replaces 'auto', which meant the same for
    # regressors but is rejected by newer scikit-learn releases.
    # Note that the int 1 means a single feature, not a fraction.
    "max_features": [1, 5, None, 'sqrt', 'log2'],
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10],
}

### Tuneo

In [0]:
# Randomised hyper-parameter search: MAX_ITER sampled settings, 3-fold CV, all cores.
# The forest is seeded as well, so repeated runs produce the same search results.
rf = RandomForestRegressor(random_state=42)

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = hiperparametros, n_iter = MAX_ITER, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Time the fit itself. The end timestamp used to be taken before fit() was called,
# so the reported duration did not include the search.
inicio = time.time()
rf_random.fit(X_train, y_train)
fin = time.time()

In [0]:
# Report the time elapsed between `inicio` and `fin`, converted from seconds to minutes.
minutos_tuneo = (fin - inicio) / 60
print("El tuneo tardó: {} minutos.".format(minutos_tuneo))