In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import datasets
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv(r'Escritorio/Orga/train.csv', index_col=0)
test = pd.read_csv(r'Escritorio/Orga/test.csv', index_col=0)

In [3]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [4]:
#preguntar porque esto funciona mejor
train = train.fillna(0)
test = test.fillna(0)

In [5]:
train['metrostotales'] = train['metrostotales'].fillna(round(train['metrostotales'].mean()))
test['metrostotales'] = test['metrostotales'].fillna(round(test['metrostotales'].mean()))
train['metroscubiertos'] = train['metroscubiertos'].fillna(round(train['metroscubiertos'].mean()))
test['metroscubiertos'] = test['metroscubiertos'].fillna(round(test['metroscubiertos'].mean()))
train['banos'] = train['banos'].fillna(round(train['banos'].mean()))
test['banos'] = test['banos'].fillna(round(test['banos'].mean()))
train['habitaciones'] = train['habitaciones'].fillna(round(train['habitaciones'].mean()))
test['habitaciones'] = test['habitaciones'].fillna(round(test['habitaciones'].mean()))
train['garages'] = train['garages'].fillna(round(train['garages'].mean()))
test['garages'] = test['garages'].fillna(round(test['garages'].mean()))
train['antiguedad'] = train['antiguedad'].fillna(0) #es el valor mas frecuente
test['antiguedad'] = test['antiguedad'].fillna(0)

train['idzona'] = train['idzona'].fillna(0)
test['idzona'] = test['idzona'].fillna(0)
train['direccion'] = train['direccion'].fillna(0)
test['direccion'] = test['direccion'].fillna(0)
train['ciudad'] = train['ciudad'].fillna(0)
test['ciudad'] = test['ciudad'].fillna(0)
train['provincia'] = train['provincia'].fillna(0)
test['provincia'] = test['provincia'].fillna(0)
train['titulo'] = train['titulo'].fillna(0)
test['titulo'] = test['titulo'].fillna(0)
train['tipodepropiedad'] = train['tipodepropiedad'].fillna(0)
test['tipodepropiedad'] = test['tipodepropiedad'].fillna(0)
train['descripcion'] = train['descripcion'].fillna(0)
test['descripcion'] = test['descripcion'].fillna(0)

In [5]:
#descarto latitud y longitud
del train['lat']
del test['lat']
del train['lng']
del test['lng']

In [6]:
#el idzona ya es un encode en si mismo, solo lo paso a string porque lo toma como float
train['idzona'] = train['idzona'].astype(str)
test['idzona'] = test['idzona'].astype(str)

## Agregando algunas Features

In [7]:
#hay que agregar los features al test set?
train['cant_amenities'] = train['usosmultiples'] + train['piscina'] + train['gimnasio']
test['cant_amenities'] = test['usosmultiples'] + test['piscina'] + test['gimnasio']

In [8]:
def features_descripcion(lista_features, train, test):
    for i in range(len(lista_features)):
        train[lista_features[i]] = train['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
        test[lista_features[i]] = test['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
    return train, test

In [9]:
lista_features = ['luminoso', 'jardin', 'terraza', 'patio', 'balcón']
train, test = features_descripcion(lista_features, train, test)

In [10]:
train['fecha'] = pd.to_datetime(train['fecha'])
test['fecha'] = pd.to_datetime(test['fecha'])

In [11]:
train['año_y_mes'] = train['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)
test['año_y_mes'] = test['fecha'].map(lambda x: 100 * x.year + x.month).astype(str)

In [12]:
train['año'] = train['fecha'].dt.year
test['año'] = test['fecha'].dt.year
train['mes'] = train['fecha'].dt.month
test['mes'] = test['fecha'].dt.month

In [13]:
encoder = LabelEncoder()
encode_cols = ['tipodepropiedad', 'provincia', 'ciudad']
train[encode_cols] = train[encode_cols].astype(str)
test[encode_cols] = test[encode_cols].astype(str)

In [14]:
train['tipodepropiedad'] = encoder.fit_transform(train['tipodepropiedad']).astype(str)
test['tipodepropiedad'] = encoder.fit_transform(test['tipodepropiedad']).astype(str)
train['provincia'] = encoder.fit_transform(train['provincia']).astype(str)
test['provincia'] = encoder.fit_transform(test['provincia']).astype(str)
train['ciudad'] = encoder.fit_transform(train['ciudad']).astype(str)
test['ciudad'] = encoder.fit_transform(test['ciudad']).astype(str)

In [17]:
train = train.join(pd.get_dummies(train['provincia'], prefix = 'prov'))
test = test.join(pd.get_dummies(test['provincia'], prefix = 'prov'))

In [20]:
train = train.join(pd.get_dummies(train['ciudad'], prefix = 'ciudad'))
test = test.join(pd.get_dummies(test['ciudad'], prefix = 'ciudad'))

In [17]:
train = train.join(pd.get_dummies(train['tipodepropiedad'], prefix = 'tipo'))
test = test.join(pd.get_dummies(test['tipodepropiedad'], prefix = 'tipo'))

In [None]:
#train = train.join(pd.get_dummies(train['idzona'], prefix = 'zona'))
#test = test.join(pd.get_dummies(test['idzona'], prefix = 'zona'))

In [20]:
drop_cols = ['fecha', 'ciudad', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train.drop(['precio'] + drop_cols, axis=1)
y = train['precio']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
test = test.drop(drop_cols, axis=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}  y={y_val.shape}")
print(f"Test shape: {test.shape}")

Train shapes: X=(180000, 955) y=(180000,)
Validation shapes: X=(60000, 955)  y=(60000,)
Test shape: (60000, 654)


## KNN 

In [21]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train, y_train) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [22]:
KNN_pred_val = neigh.predict(X_val)

In [23]:
KNN_mae = MAE(y_val, KNN_pred_val)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 723747.59450


In [23]:
KNN_pred_test = neigh.predict(test_final)

In [24]:
res = pd.DataFrame(KNN_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-KNN.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,3771234.0
51775,544833.3
115253,1145408.0
299321,885207.3
173570,662333.3


## XGBoost

In [None]:
#tome los hiperparametros de una pagina cualquiera
best_xgb_model = XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
best_xgb_model.fit(X_train,y_train)

In [20]:
XGB_pred_val = best_xgb_model.predict(X_val)

In [21]:
XGB_mae = MAE(y_val, XGB_pred_val)
print(f"MAE XGB: {XGB_mae:.5f}")

MAE XGB: 604941.97929


In [32]:
XGB_pred_test = best_xgb_model.predict(test_final)

In [33]:
res = pd.DataFrame(XGB_pred_test, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-XGB.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6163901.0
51775,1002955.0
115253,2177441.0
299321,1683117.0
173570,843052.2


In [None]:
#a partir de aca no le den bola, copie y pegue cosas del notebook de Navent

## DummyRegressor

In [4]:
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = train.drop(['precio'] + drop_cols, axis=1)
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [12]:
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
pred = dummy.predict(X_test)

In [13]:
print(f"Promedio de precios del train set: {y_train.mean()}")
print(f"Primeras 3 predicciones: {pred[:3]}")

Promedio de precios del train set: 2536913.142061111
Primeras 3 predicciones: [2536913.14206111 2536913.14206111 2536913.14206111]


In [16]:
dummy_mae = MAE(y_test, pred)
print(f"MAE DummyRegressor: {dummy_mae:.5f}")

MAE DummyRegressor: 1602549.96274


## Regresión Lineal

In [4]:
X = train[['metroscubiertos']]
y = train['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

Train shapes: X=(180000, 1) y=(180000,)
Test  shapes: X=(60000, 1)  y=(60000,)


In [12]:
imp = Imputer()
X_train['metroscubiertos'] = imp.fit_transform(X_train[['metroscubiertos']])
X_test['metroscubiertos'] = imp.transform(X_test[['metroscubiertos']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

In [24]:
linear_mae = MAE(y_test, pred)
print(f"MAE Linear: {linear_mae:.5f}")

MAE Linear: 1191167.10603


In [None]:
-------------------------------------------------------------------------------------------------------------------------

In [50]:
# Imputamos los NaNs
imp = Imputer()
train['metroscubiertos'] = imp.fit_transform(train[['metroscubiertos']])
test['metroscubiertos'] = imp.transform(test[['metroscubiertos']])

linear_pred = LinearRegression()\
                    .fit(train[['metroscubiertos']], train['precio'])\
                    .predict(test[['metroscubiertos']])

res = pd.DataFrame(linear_pred, index=test.index, columns=['precio'])
res = res.rename(columns={'precio':'target'})
display(res.head())
res.to_csv("workshop-submission-linear.csv", header=True) # RMSLE=0.65487



Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,4200756.0
51775,1112322.0
115253,1377424.0
299321,1364169.0
173570,1284638.0


In [35]:
drop_cols = ['fecha', 'ciudad', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
train_final_2012 = train_final[train_final['año_y_mes'].str.contains('2012', regex=False)]
X = train_final_2012.drop(['precio'] + drop_cols, axis=1)
y = train_final_2012['precio']
X_train_2012, X_val_2012, y_train_2012, y_val_2012 = train_test_split(X, y, test_size=0.25, random_state=1)
#test_final = test_final.drop(drop_cols, axis=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}  y={y_val.shape}")
#print(f"Test shape: {test_final.shape}")

Train shapes: X=(180000, 24) y=(180000,)
Validation shapes: X=(60000, 24)  y=(60000,)


In [42]:
y_val_2012.shape

(5884,)

In [46]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train_2012, y_train_2012) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [47]:
KNN_pred_val_2012 = neigh.predict(X_val_2012)

In [48]:
KNN_mae = MAE(y_val_2012, KNN_pred_val_2012)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 667757.50463


In [18]:
X_val_2016 = X_val[X_val['año_y_mes'].str.contains('2016', regex=False)]

In [20]:
X_val_2016['año_y_mes'].value_counts()

201612    7285
201610    1723
201606    1705
201604    1665
201608    1588
201607    1512
201601    1502
201611    1501
201609    1469
201605    1367
201603    1254
201602    1196
Name: año_y_mes, dtype: int64

In [21]:
y_val

id
245130    2300000.0
179086    2262945.0
87774     3800000.0
260997    1000000.0
109591     376200.0
            ...    
266839    2150000.0
169155    3300000.0
23033     9980000.0
19715      565000.0
202182    1910000.0
Name: precio, Length: 60000, dtype: float64

In [23]:
X_val_2016

Unnamed: 0_level_0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,...,cant_amenities,luminoso,jardin,terraza,patio,balcón,año_y_mes,tipo_encode,prov_encode,ciudad_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109591,0.0,2.0,1.0,1.0,40.0,90.0,0.0,25.795454,-100.213413,0.0,...,0.0,False,False,False,False,False,201606,3,19,56
185051,10.0,3.0,1.0,2.0,115.0,120.0,79422.0,0.000000,0.000000,0.0,...,0.0,False,False,False,False,False,201609,3,21,509
6201,0.0,3.0,0.0,0.0,312.0,180.0,0.0,0.000000,0.000000,0.0,...,0.0,False,False,False,False,False,201606,3,21,509
184162,5.0,2.0,0.0,2.0,160.0,0.0,56501.0,0.000000,0.000000,0.0,...,1.0,False,True,False,False,False,201612,3,11,818
287546,0.0,3.0,2.0,2.0,160.0,160.0,113891.0,0.000000,0.000000,0.0,...,0.0,False,False,False,True,False,201612,3,31,436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42970,5.0,3.0,2.0,2.0,238.0,160.0,23719.0,0.000000,0.000000,0.0,...,0.0,False,False,False,True,False,201602,3,9,203
185965,2.0,3.0,0.0,4.0,207.0,235.0,55093.0,19.519204,-99.176962,0.0,...,0.0,False,False,False,False,False,201612,3,11,106
3453,10.0,3.0,1.0,3.0,261.0,132.0,24474.0,19.313401,-99.243883,0.0,...,0.0,False,False,False,False,False,201612,4,9,371
274823,5.0,3.0,2.0,2.0,170.0,170.0,308261.0,20.695074,-103.385692,0.0,...,0.0,False,False,True,False,False,201612,1,15,274


In [24]:
y_val_2016 = pd.merge(X_val_2016, y_val, on='id', how='inner')

In [29]:
y_val_2016 = y_val_2016.reset_index()[['id', 'precio']]

In [32]:
y_val_2016 = y_val_2016.set_index('id')

In [33]:
y_val_2016

Unnamed: 0_level_0,precio
id,Unnamed: 1_level_1
109591,376200.0
185051,650000.0
6201,4550000.0
184162,2100000.0
287546,1250000.0
...,...
42970,3200000.0
185965,2750000.0
3453,3200000.0
274823,3850000.0


In [35]:
y_train

id
267694    5000000.0
224166    2100000.0
147987    5749000.0
144982    7800000.0
251965    1100000.0
            ...    
188691    9700000.0
206925     470000.0
272675    3500000.0
105474    2295000.0
296558    8000000.0
Name: precio, Length: 180000, dtype: float64

In [39]:
neigh = KNeighborsRegressor(n_neighbors=12)
neigh.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='uniform')

In [40]:
KNN_pred_val_2016 = neigh.predict(X_val_2016)

In [59]:
KNN_mae = MAE(y_val_2016['precio'], KNN_pred_val_2016)
print(f"MAE KNN: {KNN_mae:.5f}")

MAE KNN: 811950.17369


In [56]:
y_val_2016

Unnamed: 0_level_0,precio
id,Unnamed: 1_level_1
109591,376200.0
185051,650000.0
6201,4550000.0
184162,2100000.0
287546,1250000.0
...,...
42970,3200000.0
185965,2750000.0
3453,3200000.0
274823,3850000.0


In [60]:
KNN_pred_val_2016

array([ 486380.        , 1351666.66666667, 4138750.        , ...,
       5949166.66666667, 4594150.41666667, 1643333.33333333])