In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import datasets
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv(r'Escritorio/Orga/train.csv', index_col=0)
test = pd.read_csv(r'Escritorio/Orga/test.csv', index_col=0)
rta = pd.read_csv(r'Escritorio/Orga/ejemploRespuesta.csv')

In [3]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [4]:
#habria que hacer otra cosa con los NaNs
train = train.fillna(0)
test = test.fillna(0)

## Agregando algunas Features

In [5]:
#hay que agregar los features al test set?
train['cant_amenities'] = train['usosmultiples'] + train['piscina'] + train['gimnasio']
test['cant_amenities'] = test['usosmultiples'] + test['piscina'] + test['gimnasio']

In [6]:
def features_descripcion(lista_features, train, test):
    for i in range(len(lista_features)):
        train[lista_features[i]] = train['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
        test[lista_features[i]] = test['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
    return train, test

In [7]:
lista_features = ['luminoso', 'jardin', 'terraza', 'patio', 'balcón']
train, test = features_descripcion(lista_features, train, test)

In [8]:
train['fecha'] = pd.to_datetime(train['fecha'])
test['fecha'] = pd.to_datetime(test['fecha'])

In [9]:
train['año_y_mes'] = train['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)
test['año_y_mes'] = test['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)

In [10]:
#el idzona ya es un encode en si mismo, solo lo paso a string porque lo toma como float
train['idzona'] = train['idzona'].astype(str)
test['idzona'] = test['idzona'].astype(str)

In [11]:
encoder = LabelEncoder()
encode_cols = ['tipodepropiedad', 'provincia', 'ciudad']
train[encode_cols] = train[encode_cols].astype(str)
test[encode_cols] = test[encode_cols].astype(str)

In [12]:
encoded_train = train[encode_cols].apply(encoder.fit_transform)
encoded_test = test[encode_cols].apply(encoder.fit_transform)

In [13]:
encoded_train['tipo_encode'] = encoded_train['tipodepropiedad'].astype(str)
encoded_train['prov_encode'] = encoded_train['provincia'].astype(str)
encoded_train['ciudad_encode'] = encoded_train['ciudad'].astype(str)
#encoded_train['año_y_mes_encode'] = encoded_train['año_y_mes'].astype(str)
del encoded_train['tipodepropiedad']
del encoded_train['provincia']
del encoded_train['ciudad']
#del encoded_train['año_y_mes']
encoded_train

Unnamed: 0_level_0,tipo_encode,prov_encode,ciudad_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
254099,1,9,94
53461,4,9,371
247984,3,15,768
209067,3,11,862
185997,1,15,852
...,...,...,...
119879,3,11,862
259178,3,11,767
131932,1,9,94
146867,3,9,334


In [14]:
encoded_test['tipo_encode'] = encoded_test['tipodepropiedad'].astype(str)
encoded_test['prov_encode'] = encoded_test['provincia'].astype(str)
encoded_test['ciudad_encode'] = encoded_test['ciudad'].astype(str)
#encoded_test['año_y_mes_encode'] = encoded_test['año_y_mes'].astype(str)
del encoded_test['tipodepropiedad']
del encoded_test['provincia']
del encoded_test['ciudad']
#del encoded_test['año_y_mes']
encoded_test

Unnamed: 0_level_0,tipo_encode,prov_encode,ciudad_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4941,3,9,284
51775,1,31,299
115253,1,9,138
299321,1,13,6
173570,3,11,517
...,...,...,...
75094,3,9,532
171847,3,11,162
138313,3,19,188
271268,3,22,352


In [15]:
train_final = train.join(encoded_train)
test_final = test.join(encoded_test)

In [16]:
drop_cols = ['fecha', 'ciudad', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train_final.drop(['precio'] + drop_cols, axis=1)
y = train_final['precio']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
test_final = test_final.drop(drop_cols, axis=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}  y={y_val.shape}")
print(f"Test shape: {test_final.shape}")

Train shapes: X=(180000, 24) y=(180000,)
Validation shapes: X=(60000, 24)  y=(60000,)
Test shape: (60000, 24)


In [18]:
X_train.dtypes

antiguedad                    float64
habitaciones                  float64
garages                       float64
banos                         float64
metroscubiertos               float64
metrostotales                 float64
idzona                         object
lat                           float64
lng                           float64
gimnasio                      float64
usosmultiples                 float64
piscina                       float64
escuelascercanas              float64
centroscomercialescercanos    float64
cant_amenities                float64
luminoso                         bool
jardin                           bool
terraza                          bool
patio                            bool
balcón                           bool
año_y_mes                      object
tipo_encode                    object
prov_encode                    object
ciudad_encode                  object
dtype: object

## KNN 

In [19]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train, y_train) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [20]:
KNN_pred_val = neigh.predict(X_val)

In [21]:
KNN_mae = MAE(y_val, KNN_pred_val)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 734047.37565


In [23]:
KNN_pred_test = neigh.predict(test_final)

In [24]:
res = pd.DataFrame(KNN_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-KNN.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,3771234.0
51775,544833.3
115253,1145408.0
299321,885207.3
173570,662333.3


## XGBoost

In [None]:
#tome los hiperparametros de una pagina cualquiera
best_xgb_model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
best_xgb_model.fit(X_train,y_train)

In [20]:
XGB_pred_val = best_xgb_model.predict(X_val)

In [21]:
XGB_mae = MAE(y_val, XGB_pred_val)
print(f"MAE XGB: {XGB_mae:.5f}")

MAE XGB: 604941.97929


In [32]:
XGB_pred_test = best_xgb_model.predict(test_final)

In [33]:
res = pd.DataFrame(XGB_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-XGB.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6163901.0
51775,1002955.0
115253,2177441.0
299321,1683117.0
173570,843052.2


In [None]:
#a partir de aca no le den bola, copie y pegue cosas del notebook de Navent

## DummyRegressor

In [4]:
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train.drop(['precio'] + drop_cols, axis=1)
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [12]:
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
pred = dummy.predict(X_test)

In [13]:
print(f"Promedio de precios del train set: {y_train.mean()}")
print(f"Primeras 3 predicciones: {pred[:3]}")

Promedio de precios del train set: 2536913.142061111
Primeras 3 predicciones: [2536913.14206111 2536913.14206111 2536913.14206111]


In [16]:
dummy_mae = MAE(y_test, pred)
print(f"MAE DummyRegressor: {dummy_mae:.5f}")

MAE DummyRegressor: 1602549.96274


## Regresión Lineal

In [4]:
X = train[['metroscubiertos']]
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

Train shapes: X=(180000, 1) y=(180000,)
Test  shapes: X=(60000, 1)  y=(60000,)


In [12]:
imp = Imputer()
X_train['metroscubiertos'] = imp.fit_transform(X_train[['metroscubiertos']])
X_test['metroscubiertos'] = imp.transform(X_test[['metroscubiertos']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

In [24]:
linear_mae = MAE(y_test, pred)
print(f"MAE Linear: {linear_mae:.5f}")

MAE Linear: 1191167.10603


In [None]:
-------------------------------------------------------------------------------------------------------------------------

In [50]:
# Imputamos los NaNs
imp = Imputer()
train['metroscubiertos'] = imp.fit_transform(train[['metroscubiertos']])
test['metroscubiertos'] = imp.transform(test[['metroscubiertos']])

linear_pred = LinearRegression()\
                    .fit(train[['metroscubiertos']], train['precio'])\
                    .predict(test[['metroscubiertos']])

res = pd.DataFrame(linear_pred, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-linear.csv", header=True) # RMSLE=0.65487



Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,4200756.0
51775,1112322.0
115253,1377424.0
299321,1364169.0
173570,1284638.0
