# Digital House - Data Science a Distancia

## Trabajo Práctico 2



### Autores: Nahuel Bonfante, Daniel Borrino, Ivan Mongi, Jessica Polakoff, Julio Tentor

<p style="text-align:right;">Abril 2022</p>

#### Aspectos técnicos
La notebook se ejecuta correctamente en una instalación estándar de Anaconda versión 4.11.0 build 3.21.6, Python 3.9.7.


#### Librerías necesarias

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt

In [2]:
# Relative path to the CSV holding the final dataset.
data_final_url = "../Data/properatti_final.csv"
# Read the CSV as UTF-8 text into a DataFrame.
# NOTE(review): the dtypes output shows an "Unnamed: 0" column, presumably an
# index saved with the file - confirm whether it should be read via index_col.
data_final = pd.read_csv(data_final_url, encoding="utf-8")

In [3]:
# Dataset dimensions as (rows, columns).
data_final.shape

(67268, 10)

In [4]:
# Data type of each column.
data_final.dtypes

Unnamed: 0                 int64
property_type             object
place_name                object
state_name                object
price_aprox_usd          float64
surface_covered_in_m2    float64
cochera                    int64
pileta                     int64
parrilla                   int64
ambientes_final          float64
dtype: object

In [5]:
# Preview the first rows of the dataset.
data_final.head()

Unnamed: 0.1,Unnamed: 0,property_type,place_name,state_name,price_aprox_usd,surface_covered_in_m2,cochera,pileta,parrilla,ambientes_final
0,2,apartment,Mataderos,Capital Federal,72000.0,55.0,0,0,0,2.0
1,4,apartment,Centro,Buenos Aires Costa Atlántica,64000.0,35.0,0,0,0,
2,7,apartment,Belgrano,Capital Federal,138000.0,40.0,0,1,0,
3,8,apartment,Belgrano,Capital Federal,195000.0,60.0,0,1,0,
4,13,apartment,Palermo Soho,Capital Federal,111700.0,30.0,0,1,0,1.0


In [6]:
# Pairwise correlation between price and covered surface
# (DataFrame.corr uses the Pearson coefficient by default).
data_final[['price_aprox_usd', 'surface_covered_in_m2']].corr()

Unnamed: 0,price_aprox_usd,surface_covered_in_m2
price_aprox_usd,1.0,0.686467
surface_covered_in_m2,0.686467,1.0


---



In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [8]:
def train_LinearRegression(X, y) :
    """Fit a scikit-learn Ordinary Least Squares regression and report its R2.

    X array of array of features
    y array of target values

    The data is split with train_test_split using a fixed random_state (1),
    the model is fitted on the training part only, and R2 is computed on
    both the training and the test part.

    Prints the R2 values, rounded to 3 decimals, for training and test data.
    Returns nothing.
    """

    model = LinearRegression(fit_intercept = True)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 1)

    model.fit(Xtrain, ytrain)

    ymodel = model.predict(Xtrain)
    ypred = model.predict(Xtest)

    # Built-in round() on a float works whether r2_score hands back a NumPy
    # scalar or a plain Python float; a plain float has no .round() method,
    # so the previous `r2_score(...).round(3)` could raise AttributeError.
    r2_train = round(float(r2_score(ytrain, ymodel)), 3)
    r2_test = round(float(r2_score(ytest, ypred)), 3)

    print("LinearRegression R2 values")
    print('R2 train:', r2_train)
    print('R2 test :', r2_test)


In [9]:
import statsmodels.api as sm

In [10]:
from warnings import simplefilter
# Ignore every FutureWarning raised from here on - presumably to keep
# library deprecation notices out of the cell outputs.
simplefilter(action='ignore', category=FutureWarning)

In [11]:
def stats_OLS(X, y):
    """Fit a statsmodels Ordinary Least Squares regression and report p-values.

    X array of array of features (a constant column is added before fitting)
    y array of target values

    Prints the p-values associated with each term of the fitted model.
    Returns nothing.
    """
    design_matrix = sm.add_constant(X)
    fitted = sm.OLS(y, design_matrix).fit()

    print("statsmodels OLS pvalues", fitted.pvalues, sep="\n")


In [12]:
# Baseline model: price explained by covered surface alone.
X = data_final.loc[:, ['surface_covered_in_m2']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)


statsmodels OLS pvalues
const                    0.0
surface_covered_in_m2    0.0
dtype: float64
LinearRegression R2 values
R2 train: 0.467
R2 test : 0.483


In [13]:

# Covered surface plus the `cochera` flag as features.
X = data_final.loc[:, ['surface_covered_in_m2', 'cochera']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)


statsmodels OLS pvalues
const                    0.000000e+00
surface_covered_in_m2    0.000000e+00
cochera                  3.014272e-13
dtype: float64
LinearRegression R2 values
R2 train: 0.468
R2 test : 0.484


In [14]:

# Covered surface plus the `pileta` flag as features.
X = data_final.loc[:, ['surface_covered_in_m2', 'pileta']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)


statsmodels OLS pvalues
const                    0.0
surface_covered_in_m2    0.0
pileta                   0.0
dtype: float64
LinearRegression R2 values
R2 train: 0.48
R2 test : 0.495


In [15]:

# Covered surface plus the `parrilla` flag as features.
X = data_final.loc[:, ['surface_covered_in_m2', 'parrilla']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)


statsmodels OLS pvalues
const                     0.000000e+00
surface_covered_in_m2     0.000000e+00
parrilla                 6.711728e-225
dtype: float64
LinearRegression R2 values
R2 train: 0.475
R2 test : 0.492


In [16]:
# One-hot encode property_type, dropping the first category as the baseline.
# dtype=int keeps the dummies numeric (0/1): newer pandas versions default to
# bool, and a bool column mixed with float features becomes an object array
# that statsmodels OLS cannot fit.
property_type_dummies = pd.get_dummies(data = data_final['property_type'], prefix = 'property_type_', drop_first = True, dtype = int)

In [17]:
# Preview the dummy column(s) generated for property_type.
property_type_dummies.head()

Unnamed: 0,property_type__house
0,0
1,0
2,0
3,0
4,0


In [18]:
# Store the property-type dummy as a regular column of the dataset.
# squeeze() turns the one-column dummies frame into a Series; with more than
# one dummy column it stays a DataFrame and the assignment fails, as before.
data_final['property_type_dummies'] = property_type_dummies.squeeze(axis = 'columns')

In [19]:

# Covered surface, property-type dummy and the `pileta` flag as features.
X = data_final.loc[:, ['surface_covered_in_m2', 'property_type_dummies', 'pileta']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)


statsmodels OLS pvalues
const                     0.000000e+00
surface_covered_in_m2     0.000000e+00
property_type_dummies     0.000000e+00
pileta                   6.046476e-302
dtype: float64
LinearRegression R2 values
R2 train: 0.53
R2 test : 0.548


In [20]:
# THIS IS THE PROBLEM !!!
# NOTE(review): presumably refers to the many states with very few
# observations shown by the counts - confirm with the authors.
data_final['state_name'].value_counts()

Capital Federal                 20763
Bs.As. G.B.A. Zona Norte        17831
Bs.As. G.B.A. Zona Sur           7428
Buenos Aires Costa Atlántica     5444
Córdoba                          4899
Bs.As. G.B.A. Zona Oeste         4496
Santa Fe                         3710
Buenos Aires Interior            1127
Mendoza                           348
Corrientes                        303
Río Negro                         166
Misiones                          159
Neuquén                           151
San Luis                          118
Tucumán                            75
Entre Ríos                         73
Salta                              56
Chubut                             32
Chaco                              24
Tierra Del Fuego                   24
La Pampa                           13
Santa Cruz                          8
Catamarca                           8
Jujuy                               6
La Rioja                            2
Santiago Del Estero                 2
San Juan    

In [21]:
# Work out the "weight" of the most frequent states: the share of all
# observations they cover (more than 90% of the data).
rows = 8
data_final['state_name'].value_counts().iloc[:rows].sum() / len(data_final)

0.9766605220907415

In [22]:
# Labels of the `rows` most frequent states (value_counts sorts by frequency).
states = data_final['state_name'].value_counts().head(rows).index

In [23]:
# Set to NaN every state that is not in the list of most frequent states.
# Series.where keeps the value where the mask is True and writes NaN elsewhere;
# this is vectorised and avoids the `np.NaN` alias, which NumPy 2.0 removed.
data_final['state_name'] = data_final['state_name'].where(data_final['state_name'].isin(states))

In [24]:
# Discard the observations whose state was blanked out (state_name is NaN).
data_final = data_final.dropna(subset = ['state_name'])

In [25]:
# Dataset dimensions after dropping the rows with a missing state_name.
data_final.shape

(65698, 11)

In [26]:
# Price vs. covered surface correlation, recomputed on the filtered dataset.
data_final[['price_aprox_usd', 'surface_covered_in_m2']].corr()

Unnamed: 0,price_aprox_usd,surface_covered_in_m2
price_aprox_usd,1.0,0.688263
surface_covered_in_m2,0.688263,1.0


In [27]:
# Same three features as before, now fitted on the state-filtered dataset.
X = data_final.loc[:, ['surface_covered_in_m2', 'property_type_dummies', 'pileta']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)

statsmodels OLS pvalues
const                     0.000000e+00
surface_covered_in_m2     0.000000e+00
property_type_dummies     0.000000e+00
pileta                   4.935401e-288
dtype: float64
LinearRegression R2 values
R2 train: 0.533
R2 test : 0.551


In [28]:
# Observations per state that remain after the filtering.
data_final['state_name'].value_counts()

Capital Federal                 20763
Bs.As. G.B.A. Zona Norte        17831
Bs.As. G.B.A. Zona Sur           7428
Buenos Aires Costa Atlántica     5444
Córdoba                          4899
Bs.As. G.B.A. Zona Oeste         4496
Santa Fe                         3710
Buenos Aires Interior            1127
Name: state_name, dtype: int64

In [29]:
# One-hot encode state_name, dropping the first category as the baseline.
# dtype=int keeps the dummies numeric (0/1): newer pandas versions default to
# bool, and bool columns mixed with float features become an object array
# that statsmodels OLS cannot fit.
state_name_dummies = pd.get_dummies(data = data_final['state_name'], prefix = 'state_', drop_first = True, dtype = int)

In [30]:
# Preview the dummy columns generated for state_name.
state_name_dummies.head()

Unnamed: 0,state__Bs.As. G.B.A. Zona Oeste,state__Bs.As. G.B.A. Zona Sur,state__Buenos Aires Costa Atlántica,state__Buenos Aires Interior,state__Capital Federal,state__Córdoba,state__Santa Fe
0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0


In [31]:
# Row index of the dummies frame - presumably inspected to compare it with
# data_final's index before the two are concatenated.
state_name_dummies.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            67258, 67259, 67260, 67261, 67262, 67263, 67264, 67265, 67266,
            67267],
           dtype='int64', length=65698)

In [32]:
# Row index of the dataset - presumably inspected to compare it with the
# dummies frame's index before the two are concatenated.
data_final.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            67258, 67259, 67260, 67261, 67262, 67263, 67264, 67265, 67266,
            67267],
           dtype='int64', length=65698)

In [33]:
# Append the state dummy columns to the dataset (rows are aligned by index).
data_final = pd.concat(objs = [data_final, state_name_dummies], axis = 'columns')

In [34]:
# Names of the state dummy columns.
list(state_name_dummies.columns)

['state__Bs.As. G.B.A. Zona Oeste',
 'state__Bs.As. G.B.A. Zona Sur',
 'state__Buenos Aires Costa Atlántica',
 'state__Buenos Aires Interior',
 'state__Capital Federal',
 'state__Córdoba',
 'state__Santa Fe']

In [35]:
# Covered surface, every state dummy, property-type dummy and `pileta`.
X = data_final.loc[:, ['surface_covered_in_m2', *state_name_dummies.columns, 'property_type_dummies', 'pileta']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)

statsmodels OLS pvalues
const                                   0.000000e+00
surface_covered_in_m2                   0.000000e+00
state__Bs.As. G.B.A. Zona Oeste        4.778843e-136
state__Bs.As. G.B.A. Zona Sur          2.169872e-157
state__Buenos Aires Costa Atlántica    5.404275e-135
state__Buenos Aires Interior            9.702410e-62
state__Capital Federal                 7.725530e-185
state__Córdoba                          0.000000e+00
state__Santa Fe                        8.443000e-137
property_type_dummies                   0.000000e+00
pileta                                 1.322718e-178
dtype: float64
LinearRegression R2 values
R2 train: 0.582
R2 test : 0.599


In [36]:
# Observations per place_name.
data_final['place_name'].value_counts()

Córdoba                         3662
Mar del Plata                   3339
Rosario                         3216
Tigre                           2425
Nordelta                        2397
                                ... 
Villa Lynch                        9
Villa Real                         9
Mar del Tuyú                       7
Buenos Aires Costa Atlántica       6
San Roque                          1
Name: place_name, Length: 278, dtype: int64

In [39]:
# Work out the "weight" of the most frequent "places" or "zones": the share
# of all observations they cover (more than 90% of the data).
rows = 120
data_final['place_name'].value_counts().iloc[:rows].sum() / len(data_final)

0.9112910590885567

In [41]:
# Labels of the `rows` most frequent places (value_counts sorts by frequency).
places = data_final['place_name'].value_counts().head(rows).index

In [42]:
# Set to NaN every place that is not in the list of most frequent places.
# Series.where keeps the value where the mask is True and writes NaN elsewhere;
# this is vectorised and avoids the `np.NaN` alias, which NumPy 2.0 removed.
data_final['place_name'] = data_final['place_name'].where(data_final['place_name'].isin(places))

In [43]:
# Discard the observations whose place was blanked out (place_name is NaN).
data_final = data_final.dropna(subset = ['place_name'])

In [44]:
# Dimensions of the dataset that remains after the filtering.
data_final.shape

(59870, 18)

In [45]:
# Observations per place that remain after the filtering.
data_final['place_name'].value_counts()

Córdoba             3662
Mar del Plata       3339
Rosario             3216
Tigre               2425
Nordelta            2397
                    ... 
San Andres            88
Wilde                 88
Jose Marmol           88
Parque Chacabuco      87
Plaza Mitre           86
Name: place_name, Length: 120, dtype: int64

In [47]:
# One-hot encode place_name, dropping the first category as the baseline.
# dtype=int keeps the dummies numeric (0/1): newer pandas versions default to
# bool, and bool columns mixed with float features become an object array
# that statsmodels OLS cannot fit.
place_name_dummies = pd.get_dummies(data = data_final['place_name'], prefix = 'place_', drop_first = True, dtype = int)

In [48]:
# Append the place dummy columns to the dataset (rows are aligned by index).
data_final = pd.concat(objs = [data_final, place_name_dummies], axis = 'columns')

In [49]:
# Covered surface, every place dummy, property-type dummy and `pileta`.
X = data_final.loc[:, ['surface_covered_in_m2', *place_name_dummies.columns, 'property_type_dummies', 'pileta']]
y = data_final.loc[:, 'price_aprox_usd']

# p-values from statsmodels first, then train/test R2 from scikit-learn.
stats_OLS(X, y)
train_LinearRegression(X, y)

statsmodels OLS pvalues
const                       3.802665e-47
surface_covered_in_m2       0.000000e+00
place__Adrogué              1.289020e-18
place__Almagro              1.035971e-17
place__Avellaneda           1.512875e-32
                               ...      
place__Villa del Parque     3.881589e-10
place__Wilde                5.927151e-17
place__otro_place           1.085807e-57
property_type_dummies      8.235075e-291
pileta                      3.336109e-71
Length: 123, dtype: float64
LinearRegression R2 values
R2 train: 0.724
R2 test : 0.715
