In [13]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [14]:
data = pd.read_csv('data_caba.csv')

pd.options.display.max_columns = None

In [15]:
# Creamos rangos de m2 con un pd.cut para falicitar el análisis de los tipos de propiedades

bins = [0,50,100,300000]
cuantil = pd.cut(data['surface_total_in_m2'], bins, right=False)
cuantil.value_counts()

[50, 100)        10432
[0, 50)          10359
[100, 300000)     8193
Name: surface_total_in_m2, dtype: int64

In [16]:
pd.options.display.max_rows = None
grp_barrio = data.groupby(['property_type',cuantil])[['price_usd_per_m2','price_aprox_usd']].mean().round(2)
grp_barrio

Unnamed: 0_level_0,Unnamed: 1_level_0,price_usd_per_m2,price_aprox_usd
property_type,surface_total_in_m2,Unnamed: 2_level_1,Unnamed: 3_level_1
PH,"[0, 50)",2462.22,101316.32
PH,"[50, 100)",1932.93,141754.9
PH,"[100, 300000)",1418.86,252918.96
apartment,"[0, 50)",4048.39,136986.27
apartment,"[50, 100)",2652.35,180648.64
apartment,"[100, 300000)",3208.24,574940.96
house,"[0, 50)",8044.27,282143.82
house,"[50, 100)",2442.44,187791.59
house,"[100, 300000)",1388.61,527297.4
store,"[0, 50)",3683.79,103491.4


In [17]:
data.groupby(['place_name'])[['price_aprox_usd']].count().sort_values(by='price_aprox_usd',ascending=False).head()


Unnamed: 0_level_0,price_aprox_usd
place_name,Unnamed: 1_level_1
Belgrano,2532
Palermo,2515
Caballito,2055
Recoleta,1424
Villa Urquiza,1361


In [18]:
data.groupby(['place_name'])[['price_usd_per_m2']].mean().sort_values(by='price_usd_per_m2',ascending=False).head()

Unnamed: 0_level_0,price_usd_per_m2
place_name,Unnamed: 1_level_1
Boedo,10697.513056
San Cristobal,9952.38857
Puerto Madero,6029.687303
Palermo Chico,4797.326384
Las Cañitas,3476.175811


Trabajaremos sobre un subdata con los tipos de propiedades house, PH y apartment de la Comuna 1 de CABA

In [19]:
# Seleccionamos los barrios de Caba pertenecientes a la Comuna 1

data_comuna_1 = data[(data['place_name']=='Retiro') | (data['place_name']=='San Nicolás') | (data['place_name']=='Monserrat') | (data['place_name']== 'Puerto Madero') | (data['place_name']=='San Telmo') | (data['place_name']=='Constitución')]

In [20]:
# Quitamos el tipo de propiedad store

data_c1 = data_comuna_1[(data_comuna_1['property_type']!='store')]

In [21]:
# Verificamos que no exista ningún outlier en los mt2

data_c1['surface_total_in_m2'].sort_values(ascending=False).head(3)

27225    1183.0
4572     1159.0
7128      950.0
Name: surface_total_in_m2, dtype: float64

## Modelo Ridge Con Tres Features


In [22]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [26]:
data_c1.head(2)

Unnamed: 0.1,Unnamed: 0,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,description,title,image_thumbnail,TitleClean,DescriptionClean,Title_m2,SinUSD,SinUSS,Sin_m2,id,price_usd_m2_filled,price_m2_filled,mts2_completar,lat_filled,lon_filled,geometry,distancia_obelisco,surface_total_in_m2_2
13,47,sell,house,Puerto Madero,|Argentina|Capital Federal|Puerto Madero|,Argentina,Capital Federal,3429760.0,"-34.6177194,-58.3620561",-34.617719,-58.362056,70000.0,USD,1235115.0,70000.0,80.0,50.0,875.0,1400.0,,,,http://www.properati.com.ar/15bu1_venta_casa_p...,ESTRUCTURA ECOLOGICA FLOTANTEIDEAL PARA UBICAR...,FLOAT LIFE - CASA FLOTANTE - UBICABLE EN CLUBS...,https://thumbs4.properati.com/5/UvWlBF2qAapN1p...,,,,,,,13,875.0,1400.0,80.0,-34.617719,-58.362056,POINT (-58.3620561 -34.6177194),2287.93056,160.0
24,97,sell,apartment,San Telmo,|Argentina|Capital Federal|San Telmo|,Argentina,Capital Federal,3428113.0,"-34.6247504837,-58.3791482449",-34.62475,-58.379148,1423800.0,ARS,1408947.37,79852.0,34.462386,,,,,1.0,,http://www.properati.com.ar/15cii_venta_depart...,Torre de 15 pisos con amenities ubicados en el...,VIVÍ CERCA DE TODO Av. 9 de Julio,https://thumbs4.properati.com/2/ufRxXZ9qkWC7ma...,,,,,,,24,2317.076969,16304.485007,34.462386,-34.62475,-58.379148,POINT (-58.37914824489999 -34.6247504837),1161.974369,68.924771


In [27]:
data_c1['surface_total_in_m2_2'] = 2 * data_c1.surface_total_in_m2

feature_cols = ['surface_total_in_m2', 'distancia_obelisco', 'surface_total_in_m2_2' ]

X = data_c1[feature_cols]

scaler = StandardScaler()

X_std = scaler.fit_transform(X)

y = data_c1.price_aprox_usd


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = 117)

model_ridge_cv = linear_model.RidgeCV(alphas= [0.3, 0.5, 1.0, 1.1, 1.15, 1.17, 1.18, 1.19, 1.2, 1.21, 1.22, 1.3, 1.4, 1.5, 10.0], 
                                   fit_intercept=True, normalize=False, cv=10)

model_fit_ridge_cv = model_ridge_cv.fit(X_train, y_train)

print(model_fit_ridge_cv.alpha_)
print(model_fit_ridge_cv.best_score_)


10.0
0.5290124091089752


In [29]:
best_alpha = model_fit_ridge_cv.alpha_
model_ridge = linear_model.Ridge(alpha = best_alpha, fit_intercept = True, normalize = False)
model_fit_ridge = model_ridge.fit(X_train, y_train)
print(model_fit_ridge.coef_)
print(model_fit_ridge.intercept_)
print(model_fit_ridge.score(X_train, y_train))

[158467.07210102 100022.15937712 158467.07210101]
305257.96303278836
0.5892540598344154


In [30]:
model_fit_ridge.score(X_test, y_test)

0.514538926146476

## Modelo Lasso Con Tres Features


In [32]:
model_lasso_cv = linear_model.LassoCV(alphas= [0.05, 0.06, 0.07, 0.08, 0.09, 0.95, 0.97, 0.1, 0.12, 0.15, 0.2, 0.3, 1.0, 10.0], 
                                   fit_intercept=True, normalize=False, cv=10, max_iter=1e4)

model_fit_lasso_cv = model_lasso_cv.fit(X_train, y_train)

print(model_fit_lasso_cv.alpha_)

print(model_fit_lasso_cv.score(X_train, y_train))

0.3
0.5892581755118205


In [33]:
best_alpha = model_fit_lasso_cv.alpha_
model_lasso = linear_model.Lasso(alpha = best_alpha, fit_intercept = True, normalize = False)
model_fit_lasso = model_lasso.fit(X_train, y_train)
print(model_fit_lasso.coef_)
print(model_fit_lasso.intercept_)
print(model_fit_lasso.score(X_test, y_test))

[345489.68749828 100329.42836224 -27741.03948671]
305263.13562713104
0.5141575690138557


## Ridge Polinomica


In [35]:
data_c1["surface_total_in_m2_3"] = data_c1.surface_total_in_m2 * data_c1.surface_total_in_m2_2
data_c1["surface_total_in_m2_4"] = data_c1.surface_total_in_m2 * data_c1.surface_total_in_m2_3
data_c1["surface_total_in_m2_5"] = data_c1.surface_total_in_m2 * data_c1.surface_total_in_m2_4

In [36]:
feature_cols = ['surface_total_in_m2', 'surface_total_in_m2_2', 'surface_total_in_m2_3', 'surface_total_in_m2_4', 'surface_total_in_m2_5' ]

X2 = data_c1[feature_cols]

scaler = StandardScaler()

X_std2 = scaler.fit_transform(X)

y2 = data_c1.price_aprox_usd

In [37]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_std2, y2, test_size = 0.3, random_state = 117)

model_ridge_cv = linear_model.RidgeCV(alphas= [0.3, 0.5, 1.0, 1.1, 1.15, 1.17, 1.18, 1.19, 1.2, 1.21, 1.22, 1.3, 1.4, 1.5, 10.0], 
                                   fit_intercept=True, normalize=False, cv=10)

model_fit_ridge_cv = model_ridge_cv.fit(X_train2, y_train2)

print(model_fit_ridge_cv.alpha_)
print(model_fit_ridge_cv.best_score_)


10.0
0.5290124091089752


In [38]:
best_alpha = model_fit_ridge_cv.alpha_
model_ridge = linear_model.Ridge(alpha = best_alpha, fit_intercept = True, normalize = False)
model_fit_ridge = model_ridge.fit(X_train2, y_train2)
print(model_fit_ridge.coef_)
print(model_fit_ridge.intercept_)
print(model_fit_ridge.score(X_train2, y_train2))

[158467.07210102 100022.15937712 158467.07210101]
305257.96303278836
0.5892540598344154


In [39]:
model_fit_ridge.score(X_test2, y_test2)

0.514538926146476