In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import datasets
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv(r'Escritorio/Orga/train.csv', index_col=0)
test = pd.read_csv(r'Escritorio/Orga/test.csv', index_col=0)
rta = pd.read_csv(r'Escritorio/Orga/ejemploRespuesta.csv')

In [3]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [4]:
#habria que hacer otra cosa con los NaNs
train = train.fillna(0)
test = test.fillna(0)

## Agregando algunas Features

In [5]:
#hay que agregar los features al test set?
train['cant_amenities'] = train['usosmultiples'] + train['piscina'] + train['gimnasio']
test['cant_amenities'] = test['usosmultiples'] + test['piscina'] + test['gimnasio']

In [6]:
def features_descripcion(lista_features, train, test):
    for i in range(len(lista_features)):
        train[lista_features[i]] = train['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
        test[lista_features[i]] = test['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
    return train, test

In [7]:
lista_features = ['luminoso', 'jardin', 'terraza', 'patio', 'balcón']
train, test = features_descripcion(lista_features, train, test)

In [8]:
train['fecha'] = pd.to_datetime(train['fecha'])
test['fecha'] = pd.to_datetime(test['fecha'])

In [9]:
train['año_y_mes'] = train['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)
test['año_y_mes'] = test['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)

In [10]:
encoder = LabelEncoder()
encode_cols = ['tipodepropiedad', 'provincia', 'ciudad', 'año_y_mes']
train[encode_cols] = train[encode_cols].astype(str)
test[encode_cols] = test[encode_cols].astype(str)

In [11]:
encoded_train = train[encode_cols].apply(encoder.fit_transform)
encoded_test = test[encode_cols].apply(encoder.fit_transform)

In [12]:
#el idzona ya es un encode en si mismo, solo lo paso a string porque lo toma como float
train['idzona'] = train['idzona'].astype(str)
test['idzona'] = test['idzona'].astype(str)

In [13]:
encoded_train['tipo_encode'] = encoded_train['tipodepropiedad'].astype(str)
encoded_train['prov_encode'] = encoded_train['provincia'].astype(str)
encoded_train['ciudad_encode'] = encoded_train['ciudad'].astype(str)
encoded_train['año_y_mes_encode'] = encoded_train['año_y_mes'].astype(str)
del encoded_train['tipodepropiedad']
del encoded_train['provincia']
del encoded_train['ciudad']
del encoded_train['año_y_mes']
encoded_train

Unnamed: 0_level_0,tipo_encode,prov_encode,ciudad_encode,año_y_mes_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
254099,1,9,94,43
53461,4,9,371,17
247984,3,15,768,45
209067,3,11,862,2
185997,1,15,852,53
...,...,...,...,...
119879,3,11,862,37
259178,3,11,767,30
131932,1,9,94,38
146867,3,9,334,35


In [14]:
encoded_test['tipo_encode'] = encoded_test['tipodepropiedad'].astype(str)
encoded_test['prov_encode'] = encoded_test['provincia'].astype(str)
encoded_test['ciudad_encode'] = encoded_test['ciudad'].astype(str)
encoded_test['año_y_mes_encode'] = encoded_test['año_y_mes'].astype(str)
del encoded_test['tipodepropiedad']
del encoded_test['provincia']
del encoded_test['ciudad']
del encoded_test['año_y_mes']
encoded_test

Unnamed: 0_level_0,tipo_encode,prov_encode,ciudad_encode,año_y_mes_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4941,3,9,284,18
51775,1,31,299,45
115253,1,9,138,40
299321,1,13,6,39
173570,3,11,517,19
...,...,...,...,...
75094,3,9,532,44
171847,3,11,162,57
138313,3,19,188,35
271268,3,22,352,57


In [15]:
encoder = OneHotEncoder()
encode_cols = ['prov_encode', 'tipo_encode', 'ciudad_encode', 'año_y_mes_encode']
onehot_train = encoder.fit_transform(encoded_train[encode_cols])
onehot_test = encoder.fit_transform(encoded_train[encode_cols])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
onehot_train = onehot_train.toarray()
onehot_test = onehot_test.toarray()

In [18]:
onehot_train = pd.DataFrame(onehot_train)
onehot_test = pd.DataFrame(onehot_test)

In [20]:
train_final = train.join(onehot_train)
test_final = test.join(onehot_test)

In [21]:
train_final

Unnamed: 0_level_0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,centroscomercialescercanos,precio,cant_amenities,luminoso,jardin,terraza,patio,balcón,año_y_mes,0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,0.0,2.0,1.0,2.0,...,0.0,2273000.0,0.0,False,False,False,False,False,201508,
53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,2.0,...,1.0,3600000.0,0.0,False,False,True,False,False,201306,"(0, 13)\t1.0\n (0, 34)\t1.0\n (0, 65)\t1.0..."
247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.0,3.0,2.0,2.0,...,0.0,1200000.0,0.0,False,True,False,True,False,201510,
209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.0,2.0,1.0,1.0,...,1.0,650000.0,0.0,False,False,False,False,False,201203,"(0, 23)\t1.0\n (0, 34)\t1.0\n (0, 174)\t1...."
185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.0,2.0,1.0,1.0,...,0.0,1150000.0,0.0,False,False,False,False,False,201606,"(0, 9)\t1.0\n (0, 34)\t1.0\n (0, 476)\t1.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,bonita casas de 2 recamaras a 10 minutos del c...,vendo casa en bosques de ica residencial a 10 ...,Casa,BOSQUES,Zinacantepec,Edo. de México,0.0,2.0,2.0,1.0,...,0.0,650000.0,0.0,False,False,False,True,False,201502,"(0, 23)\t1.0\n (0, 36)\t1.0\n (0, 174)\t1...."
259178,casa en condominio a 10 min. del centro de toluca,"casa con un jardin amplio, un cuarto de servic...",Casa,Filiberto Navas 325,Toluca,Edo. de México,0.0,3.0,3.0,3.0,...,1.0,1940000.0,0.0,False,True,False,False,False,201407,
131932,nicolas san juan,"departamento con excelente ubicación, muy cerc...",Apartamento,Nicolas San Juan,Benito Juárez,Distrito Federal,20.0,2.0,1.0,2.0,...,0.0,3400000.0,0.0,False,False,False,False,False,201503,"(0, 15)\t1.0\n (0, 36)\t1.0\n (0, 572)\t1...."
146867,casa sola. javier rojo gomez.,"casa sola, dividida en cuatro departamentos de...",Casa,Javier Rojo Gomez 120,Iztapalapa,Distrito Federal,20.0,4.0,0.0,4.0,...,1.0,2890000.0,1.0,False,False,False,False,False,201412,"(0, 11)\t1.0\n (0, 36)\t1.0\n (0, 740)\t1...."


In [16]:
drop_cols = ['fecha', 'ciudad', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train_final.drop(['precio'] + drop_cols, axis=1)
y = train_final['precio']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
test_final = test_final.drop(drop_cols, axis=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}  y={y_val.shape}")
print(f"Test shape: {test_final.shape}")

Train shapes: X=(180000, 24) y=(180000,)
Validation shapes: X=(60000, 24)  y=(60000,)
Test shape: (60000, 24)


## KNN 

In [19]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train, y_train) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [20]:
KNN_pred_val = neigh.predict(X_val)

In [21]:
KNN_mae = MAE(y_val, KNN_pred_val)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 734047.37565


In [23]:
KNN_pred_test = neigh.predict(test_final)

In [24]:
res = pd.DataFrame(KNN_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-KNN.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,3771234.0
51775,544833.3
115253,1145408.0
299321,885207.3
173570,662333.3


## XGBoost

In [21]:
#tome los hiperparametros de una pagina cualquiera
best_xgb_model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
best_xgb_model.fit(X_train,y_train)

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields idzona, año_y_mes, tipo_encode, prov_encode, ciudad_encode

In [20]:
XGB_pred_val = best_xgb_model.predict(X_val)

In [21]:
XGB_mae = MAE(y_val, XGB_pred_val)
print(f"MAE XGB: {XGB_mae:.5f}")

MAE XGB: 604941.97929


In [32]:
XGB_pred_test = best_xgb_model.predict(test_final)

In [33]:
res = pd.DataFrame(XGB_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-XGB.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6163901.0
51775,1002955.0
115253,2177441.0
299321,1683117.0
173570,843052.2


In [None]:
#a partir de aca no le den bola, copie y pegue cosas del notebook de Navent

## DummyRegressor

In [4]:
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train.drop(['precio'] + drop_cols, axis=1)
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [12]:
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
pred = dummy.predict(X_test)

In [13]:
print(f"Promedio de precios del train set: {y_train.mean()}")
print(f"Primeras 3 predicciones: {pred[:3]}")

Promedio de precios del train set: 2536913.142061111
Primeras 3 predicciones: [2536913.14206111 2536913.14206111 2536913.14206111]


In [16]:
dummy_mae = MAE(y_test, pred)
print(f"MAE DummyRegressor: {dummy_mae:.5f}")

MAE DummyRegressor: 1602549.96274


## Regresión Lineal

In [4]:
X = train[['metroscubiertos']]
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

Train shapes: X=(180000, 1) y=(180000,)
Test  shapes: X=(60000, 1)  y=(60000,)


In [12]:
imp = Imputer()
X_train['metroscubiertos'] = imp.fit_transform(X_train[['metroscubiertos']])
X_test['metroscubiertos'] = imp.transform(X_test[['metroscubiertos']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

In [24]:
linear_mae = MAE(y_test, pred)
print(f"MAE Linear: {linear_mae:.5f}")

MAE Linear: 1191167.10603


In [None]:
-------------------------------------------------------------------------------------------------------------------------

In [50]:
# Imputamos los NaNs
imp = Imputer()
train['metroscubiertos'] = imp.fit_transform(train[['metroscubiertos']])
test['metroscubiertos'] = imp.transform(test[['metroscubiertos']])

linear_pred = LinearRegression()\
                    .fit(train[['metroscubiertos']], train['precio'])\
                    .predict(test[['metroscubiertos']])

res = pd.DataFrame(linear_pred, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-linear.csv", header=True) # RMSLE=0.65487



Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,4200756.0
51775,1112322.0
115253,1377424.0
299321,1364169.0
173570,1284638.0
