<h1 style="text-align: center;" markdown="1">TP2: Ensamble de Gradient Boosting y Elastic Net</h1>
![](../data/icon_properati-data.png)

In [23]:
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn import ensemble, tree, linear_model

import pandas as pd

# modules
import knn as knnlibrary

In [13]:
#leo dataset
properties =knnlibrary.get_dataset()

# filtro por CABA y GBA
properties_caba = properties[(properties['place_with_parent_names'].str.contains('Capital Federal') \
                             | properties['place_with_parent_names'].str.contains('Bs.As. G.B.A.'))]

# queremos solo las propiedades que tienen precio y eliminamos columnas que sabemos que no son redundantes y que no nos servirian para knn
properties_caba = properties_caba.drop(['currency','price_usd_per_m2','price_usd_per_m2','price_per_m2','price_aprox_usd','price_aprox_local_currency',\
                      'id','properati_url','image_thumbnail','description','title',\
                      'lat-lon','geonames_id'], axis = 1)

properties_caba = properties_caba[properties_caba['price'].notnull() & properties_caba['price'] > 0 & properties_caba['place_name'].notnull()]

# eliminamos propiedades con mas de 54 pisos
properties_caba = properties_caba[properties_caba['floor'] <= 54]
# eliminamos propiedades con mas de 9 pisos - ver analisis
properties_caba = properties_caba[properties_caba['rooms'] <= 9]
# eliminamos propiedades con mas de 2000 m2 de superficie cubierta - ver analisis
properties_caba = properties_caba[(properties_caba['surface_covered_in_m2'] <= 3000) & (properties_caba['surface_covered_in_m2'] >= 0)]
# eliminamos propiedades con mas de 2000 m2 de superficie cubierta - ver analisis
properties_caba = properties_caba[(properties_caba['surface_total_in_m2']<= 5000)  & (properties_caba['surface_total_in_m2'] >= 0)]


# atributos categoricos
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# transformo el campo fecha
properties_caba['created_on'] = encoder.fit_transform(properties_caba[['created_on']])
properties_caba['country_name'] = encoder.fit_transform(properties_caba[['country_name']])
properties_caba['operation'] = encoder.fit_transform(properties_caba[['operation']])

# las expensas tienen demasiados nulos por lo que voy a eliminar esa columna
properties_caba = properties_caba.drop(['expenses'], axis = 1)

# atributos categoricos
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
properties_caba = knnlibrary.encoder_attributes(properties_caba, encoder)

properties_caba.tail()

# eliminamos filas con valores nulo
properties_caba = properties_caba.dropna(how='any')
properties_caba.info()

  y = column_or_1d(y, warn=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8495 entries, 16 to 34504
Data columns (total 14 columns):
created_on                 8495 non-null int64
operation                  8495 non-null int64
property_type              8495 non-null int64
place_name                 8495 non-null int64
place_with_parent_names    8495 non-null int64
country_name               8495 non-null int64
state_name                 8495 non-null int64
lat                        8495 non-null float64
lon                        8495 non-null float64
price                      8495 non-null float64
surface_total_in_m2        8495 non-null float64
surface_covered_in_m2      8495 non-null float64
floor                      8495 non-null float64
rooms                      8495 non-null float64
dtypes: float64(7), int64(7)
memory usage: 995.5 KB


In [14]:
# filtro columnas segun lo que hay en el dataset
attributes = ['created_on','property_type','operation','place_name','place_with_parent_names',\
              'country_name','state_name','lat','lon','surface_total_in_m2','surface_covered_in_m2',\
              'floor','rooms', 'price']
properties_caba[attributes].info()
properties_caba_with_price_attributes = properties_caba[attributes]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8495 entries, 16 to 34504
Data columns (total 14 columns):
created_on                 8495 non-null int64
property_type              8495 non-null int64
operation                  8495 non-null int64
place_name                 8495 non-null int64
place_with_parent_names    8495 non-null int64
country_name               8495 non-null int64
state_name                 8495 non-null int64
lat                        8495 non-null float64
lon                        8495 non-null float64
surface_total_in_m2        8495 non-null float64
surface_covered_in_m2      8495 non-null float64
floor                      8495 non-null float64
rooms                      8495 non-null float64
price                      8495 non-null float64
dtypes: float64(7), int64(7)
memory usage: 995.5 KB


In [15]:
# separamos el set de train
from sklearn.model_selection import train_test_split
import datetime

now = datetime.datetime.now()

# separamos el train de traing para validarlo luego usando un 30% de los datos
X, y = properties_caba_with_price_attributes.iloc[:, properties_caba_with_price_attributes.columns != 'price'].values, properties_caba_with_price_attributes.iloc[:, properties_caba_with_price_attributes.columns == 'price'].values
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.2, random_state=now.microsecond)
    
print(y_train)
    
len(X_test), len(X_train)

[[ 140000.]
 [ 248000.]
 [ 185000.]
 ..., 
 [  68300.]
 [ 112000.]
 [ 350000.]]


(1699, 6796)

In [16]:
# preprocesamiento
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Prints R2 and RMSE scores
def get_score(prediction, lables):    
    print('R2: {}'.format(r2_score(prediction, lables)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, lables))))
    
# Shows scores for train and validation sets    
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing test scores
    print("Test")
    get_score(prediction_test, y_tst)
    
# train ElasticNetCV
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(X_train_scaled, y_train)
train_test(ENSTest, X_train_scaled, X_test_scaled, y_train, y_test)

ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], copy_X=True,
       cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.01, 0.1, 0.5, 0.9, 0.99], max_iter=5000, n_alphas=100,
       n_jobs=1, normalize=False, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)
R2: -36.74284924937599
RMSE: 338015.15635734535
Test
R2: -63.04662582541725
RMSE: 349083.19032765273


  y = column_or_1d(y, warn=True)


In [25]:
# Average R2 score and standart deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.01 (+/- 0.11)


  y = column_or_1d(y, warn=True)


In [26]:
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='huber').fit(X_train_scaled, y_train)
train_test(GBest, X_train_scaled, X_test_scaled, y_train, y_test)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=3000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
R2: -0.4722491349786777
RMSE: 247722.16461530808
Test
R2: -1.0802426653438157
RMSE: 279995.73764897505


In [28]:
# Average R2 score and standart deviation of 5-fold cross-validation
scores = cross_val_score(GBest, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.44 (+/- 0.40)


# TEST

In [33]:
# leemos set de test
test_df = pd.read_csv('../data/test/properati_dataset_testing_noprice.csv', low_memory=False)
test_df.head()

Unnamed: 0,id,created_on,property_type,operation,place_name,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description
0,3632,2017-08-24,departamento,venta,Puerto Madero,|Argentina|Capital Federal|Puerto Madero|,Argentina,Capital Federal,"-34.6109877599,-58.3634635778",-34.610988,-58.363464,0.0,,,,,Edificio BA Houses situado frente al Dique 3 d...
1,3633,2017-08-25,departamento,venta,Buenos Aires Interior,|Argentina|Buenos Aires Interior|,Argentina,Buenos Aires Interior,,,,0.0,,,,,El departamento cuenta con un living-comedor a...
2,2263404,2017-08-01,departamento,venta,Palermo Soho,|Argentina|Capital Federal|Palermo|Palermo Soho|,Argentina,Capital Federal,"-34.5893633232,-58.4128798588",-34.589363,-58.41288,53.0,48.0,,,1500.0,IMPECABLE TORRE COY III – DEPA...
3,2263405,2017-08-01,departamento,venta,Chacarita,|Argentina|Capital Federal|Chacarita|,Argentina,Capital Federal,,,,39.0,39.0,,,,AMBIENTE DIVISIBLE CON PISOS D...
4,2263406,2017-08-01,departamento,venta,Chacarita,|Argentina|Capital Federal|Chacarita|,Argentina,Capital Federal,,,,51.0,51.0,,,,LIVING COMEDOR CON PISOS DE PO...


In [49]:
# transformamos atributos categoricos
test_df['place_name'] = encoder.fit_transform(test_df[['place_name']])
test_df['state_name'] = encoder.fit_transform(test_df[['state_name']])
test_df['place_with_parent_names'] = encoder.fit_transform(test_df[['place_with_parent_names']])
test_df['property_type'] = encoder.fit_transform(test_df[['property_type']])
test_df['created_on'] = encoder.fit_transform(test_df[['created_on']])
test_df['country_name'] = encoder.fit_transform(test_df[['country_name']])
test_df['operation'] = encoder.fit_transform(test_df[['operation']])

# tranformamos fechas
X_test_df = test_df[['created_on','property_type','operation','place_name','place_with_parent_names',\
              'country_name','state_name','lat','lon','surface_total_in_m2','surface_covered_in_m2',\
              'floor','rooms']]

# completamos valores nan
from sklearn.preprocessing import Imputer
imputer_mean = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_test_df['floor'] = X_test_df[['floor']].fillna(1)
X_test_df['rooms'] = X_test_df[['rooms']].fillna(1)

X_test_df["surface_total_in_m2"] = imputer_mean.fit_transform(X_test_df[["surface_total_in_m2"]])
X_test_df["surface_covered_in_m2"] = imputer_mean.fit_transform(X_test_df[["surface_covered_in_m2"]])
X_test_df["lat"] = imputer_mean.fit_transform(X_test_df[["lat"]])
X_test_df["lon"] = imputer_mean.fit_transform(X_test_df[["lon"]])

#X_test_std_df = stdsc.transform(X_test_df)

X_test_df.head()

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-

Unnamed: 0,created_on,property_type,operation,place_name,place_with_parent_names,country_name,state_name,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms
0,23,1,1,152,219,0,4,-34.610988,-58.363464,0.0,359.471588,1.0,1.0
1,24,1,1,29,180,0,3,-34.629923,-58.46582,0.0,359.471588,1.0,1.0
2,0,1,1,136,210,0,4,-34.589363,-58.41288,53.0,48.0,1.0,1.0
3,0,1,1,40,193,0,4,-34.629923,-58.46582,39.0,39.0,1.0,1.0
4,0,1,1,40,193,0,4,-34.629923,-58.46582,51.0,51.0,1.0,1.0


In [50]:
# Retraining models
GB_model = GBest.fit(X_train_scaled, y_train)
ENST_model = ENSTest.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [60]:
## Getting our SalePrice estimation
#Final_labels = (np.exp(GB_model.predict(X_test_df)) + np.exp(ENST_model.predict(X_test_df))) / 2
Final_labels = (GB_model.predict(X_test_df) + ENST_model.predict(X_test_df)) / 2
print(Final_labels)

[ 6366174.61071105  5636213.7912012   2709901.78404069 ...,
  4784388.14021986  2173358.415706    5077580.32843081]


In [61]:
output = pd.DataFrame( data={"id":test_df["id"], "price_usd":Final_labels} )

In [62]:
# guardamos
output.to_csv( "../data/result/result_GradientBoostingyElasticNet_"+str(now)+".csv", index=False, quoting=3 )