In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv(r'Escritorio/Orga/train.csv', index_col=0)
test = pd.read_csv(r'Escritorio/Orga/test.csv', index_col=0)
rta = pd.read_csv(r'Escritorio/Orga/ejemploRespuesta.csv')

In [3]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [4]:
#habria que hacer otra cosa con los NaNs
train = train.fillna(0)
test = test.fillna(0)

## Agregando algunas Features

In [5]:
#hay que agregar los features al test set?
train['cant_amenities'] = train['usosmultiples'] + train['piscina'] + train['gimnasio']
test['cant_amenities'] = test['usosmultiples'] + test['piscina'] + test['gimnasio']

In [6]:
train['luminoso'] = train['descripcion'].str.contains('luminoso', regex=False).astype(bool)
test['luminoso'] = test['descripcion'].str.contains('luminoso', regex=False).astype(bool)

In [7]:
encoder = LabelEncoder()

In [8]:
encode_cols = ['tipodepropiedad', 'provincia', 'idzona', 'ciudad']

In [9]:
train[encode_cols] = train[encode_cols].astype(str)
test[encode_cols] = test[encode_cols].astype(str)

In [10]:
encoded_train = train[encode_cols].apply(encoder.fit_transform)
encoded_test = test[encode_cols].apply(encoder.fit_transform)

In [11]:
encoded_train['tipo_encode'] = encoded_train['tipodepropiedad']
encoded_train['prov_encode'] = encoded_train['provincia']
encoded_train['idzona_encode'] = encoded_train['idzona']
encoded_train['ciudad_encode'] = encoded_train['ciudad']
del encoded_train['tipodepropiedad']
del encoded_train['provincia']
del encoded_train['idzona']
del encoded_train['ciudad']
encoded_train

Unnamed: 0_level_0,tipo_encode,prov_encode,idzona_encode,ciudad_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
254099,1,9,4123,94
53461,4,9,4812,371
247984,3,15,9127,768
209067,3,11,10265,862
185997,1,15,8721,852
...,...,...,...,...
119879,3,11,10265,862
259178,3,11,10026,767
131932,1,9,9754,94
146867,3,9,4572,334


In [12]:
encoded_test['tipo_encode'] = encoded_test['tipodepropiedad']
encoded_test['prov_encode'] = encoded_test['provincia']
encoded_test['idzona_encode'] = encoded_test['idzona']
encoded_test['ciudad_encode'] = encoded_test['ciudad']
del encoded_test['tipodepropiedad']
del encoded_test['provincia']
del encoded_test['idzona']
del encoded_test['ciudad']
encoded_test

Unnamed: 0_level_0,tipo_encode,prov_encode,idzona_encode,ciudad_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4941,3,9,0,284
51775,1,31,673,299
115253,1,9,2166,138
299321,1,13,1444,6
173570,3,11,6673,517
...,...,...,...,...
75094,3,9,3355,532
171847,3,11,6787,162
138313,3,19,8905,188
271268,3,22,9859,352


In [13]:
train_final = train.join(encoded_train)
test_final = test.join(encoded_test)

In [18]:
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train_final.drop(['precio'] + drop_cols, axis=1)
y = train_final['precio']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
test_final = test_final.drop(drop_cols, axis=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}  y={y_val.shape}")
print(f"Test shape: {test_final.shape}")

Train shapes: X=(180000, 19) y=(180000,)
Validation shapes: X=(60000, 19)  y=(60000,)
Test shape: (60000, 19)


## KNN 

In [19]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train, y_train) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [20]:
KNN_pred_val = neigh.predict(X_val)

In [21]:
KNN_mae = MAE(y_val, pred)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 749231.93077


In [23]:
KNN_pred_test = neigh.predict(test_final)

In [24]:
res = pd.DataFrame(KNN_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-KNN.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,3771234.0
51775,544833.3
115253,1145408.0
299321,885207.3
173570,662333.3


## XGBoost

In [25]:
#tome los hiperparametros de una pagina cualquiera
best_xgb_model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42,
                 objective ='reg:squarederror')
best_xgb_model.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.07, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=None,
             n_estimators=10000, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0.75,
             reg_lambda=0.45, scale_pos_weight=1, seed=42, silent=None,
             subsample=0.6, verbosity=1)

In [30]:
XGB_pred_val = best_xgb_model.predict(X_val)

In [31]:
XGB_mae = MAE(y_val, XGB_pred_val)
print(f"MAE XGB: {XGB_mae:.5f}")

MAE XGB: 606487.27956


In [32]:
XGB_pred_test = best_xgb_model.predict(test_final)

In [33]:
res = pd.DataFrame(XGB_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-XGB.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6163901.0
51775,1002955.0
115253,2177441.0
299321,1683117.0
173570,843052.2


In [None]:
#a partir de aca no le den bola, copie y pegue cosas del notebook de Navent

## DummyRegressor

In [4]:
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train.drop(['precio'] + drop_cols, axis=1)
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [12]:
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
pred = dummy.predict(X_test)

In [13]:
print(f"Promedio de precios del train set: {y_train.mean()}")
print(f"Primeras 3 predicciones: {pred[:3]}")

Promedio de precios del train set: 2536913.142061111
Primeras 3 predicciones: [2536913.14206111 2536913.14206111 2536913.14206111]


In [16]:
dummy_mae = MAE(y_test, pred)
print(f"MAE DummyRegressor: {dummy_mae:.5f}")

MAE DummyRegressor: 1602549.96274


## Regresión Lineal

In [4]:
X = train[['metroscubiertos']]
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

Train shapes: X=(180000, 1) y=(180000,)
Test  shapes: X=(60000, 1)  y=(60000,)


In [12]:
imp = Imputer()
X_train['metroscubiertos'] = imp.fit_transform(X_train[['metroscubiertos']])
X_test['metroscubiertos'] = imp.transform(X_test[['metroscubiertos']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

In [24]:
linear_mae = MAE(y_test, pred)
print(f"MAE Linear: {linear_mae:.5f}")

MAE Linear: 1191167.10603


In [None]:
-------------------------------------------------------------------------------------------------------------------------

In [50]:
# Imputamos los NaNs
imp = Imputer()
train['metroscubiertos'] = imp.fit_transform(train[['metroscubiertos']])
test['metroscubiertos'] = imp.transform(test[['metroscubiertos']])

linear_pred = LinearRegression()\
                    .fit(train[['metroscubiertos']], train['precio'])\
                    .predict(test[['metroscubiertos']])

res = pd.DataFrame(linear_pred, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-linear.csv", header=True) # RMSLE=0.65487



Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,4200756.0
51775,1112322.0
115253,1377424.0
299321,1364169.0
173570,1284638.0
