In [1]:
import pandas as pd
import numpy as np
from common import metrica
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [11]:
df_train = pd.read_csv("sets_de_datos/train.csv", index_col = 0)
df_test = pd.read_csv("sets_de_datos/test.csv", index_col = 0)

In [12]:
col_borrar = ["direccion", "lat", "lng", "titulo", "descripcion", "idzona", "fecha"]
df_train.drop(col_borrar, axis=1)
df_test.drop(col_borrar, axis=1)

Unnamed: 0_level_0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4941,Casa,Miguel Hidalgo,Distrito Federal,29.0,3.0,,4.0,300.0,,0.0,0.0,0.0,0.0,0.0
51775,Apartamento,Mérida,Yucatán,,1.0,1.0,1.0,67.0,67.0,0.0,0.0,0.0,0.0,0.0
115253,Apartamento,Coyoacán,Distrito Federal,0.0,2.0,1.0,2.0,87.0,100.0,0.0,0.0,0.0,0.0,1.0
299321,Apartamento,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,2.0,86.0,86.0,0.0,0.0,0.0,0.0,0.0
173570,Casa,Tultitlán,Edo. de México,10.0,2.0,1.0,1.0,80.0,76.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75094,Casa,Venustiano Carranza,Distrito Federal,20.0,4.0,3.0,3.0,291.0,,0.0,0.0,0.0,1.0,1.0
171847,Casa,Ecatepec de Morelos,Edo. de México,10.0,3.0,1.0,2.0,71.0,87.0,0.0,0.0,0.0,1.0,1.0
138313,Casa,Guadalupe,Nuevo León,5.0,3.0,2.0,2.0,102.0,,0.0,0.0,0.0,1.0,1.0
271268,Casa,Querétaro,Querétaro,0.0,2.0,1.0,2.0,130.0,144.0,0.0,0.0,0.0,1.0,1.0


In [13]:
df_train = df_train.fillna(value = {'tipodepropiedad' : 'nan', 'provincia' : 'nan', 'ciudad' : 'nan'})
df_test = df_test.fillna(value = {'tipodepropiedad' : 'nan', 'provincia' : 'nan', 'ciudad' : 'nan'})

In [14]:
datos = df_train.drop(["precio"], axis=1)
precios = df_train["precio"]

In [15]:
columnas_pipe = ColumnTransformer(transformers = [
    ('nan_to_mean', SimpleImputer(strategy = 'mean'), ['metrostotales', 'metroscubiertos', 'antiguedad']),
    ('nan_to_cero', SimpleImputer(strategy = 'constant', fill_value = 0), ['habitaciones', 'banos', 'garages'])
])

pre_processor_pipe = Pipeline(steps =[
    ('ordinal_encoder', TargetEncoder(cols = ["ciudad", "provincia", "tipodepropiedad"])),
    ('columns_pipe', columnas_pipe)
])

#Datos sacados del tuneo anterior

xgb_model = XGBRegressor(
    objective = 'reg:squarederror',
    subsample = 0.6,
    n_estimators = 1000,
    min_child_weight = 3,
    max_depth = 9,
    learning_rate = 0.01,
    reg_lambda = 0.25,
    gamma = 0.75,
    colsample_bytree = 0.6
)

ppal_pipe = Pipeline(steps = [
    ('preprocessing', pre_processor_pipe), 
    ('xgb_regressor', xgb_model)
])

In [16]:
pipe = ppal_pipe.fit(datos, precios)

In [17]:
prediccion = pipe.predict(df_test)

In [18]:
df_pred = pd.DataFrame(prediccion, index=df_test.index, columns=['target'])
df_pred.to_csv("submits/submit_A_xgboost_1.csv", header=True)