In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import common.common_machine_learning as common

#pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [29]:
def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

## Limpieza de datasets

### Elijo columnas basándome en la importancia según Univariate

In [30]:
TARGET = 'precio'
columnas = ["tipodepropiedad", "provincia", "antiguedad", "metroscubiertos",
           "metrostotales", "gimnasio", "usosmultiples", "piscina", "escuelascercanas", "centroscomercialescercanos", "precio"]

In [31]:
train = pd.read_csv('sets_de_datos/train.csv', index_col = 0)
test = pd.read_csv('sets_de_datos/test.csv', index_col = 0)

### Limpio el set train: elimino columnas con nans, relleno con promedios, etc

In [32]:
train = train[columnas]

In [33]:
train.isna().sum()

tipodepropiedad                  46
provincia                       155
antiguedad                    43555
metroscubiertos               17400
metrostotales                 51467
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
precio                            0
dtype: int64

In [34]:
train["antiguedad"].fillna(train["antiguedad"].mean(),inplace=True)
train["metroscubiertos"].fillna(train["metroscubiertos"].mean(), inplace=True)
train["metrostotales"].fillna(train["metrostotales"].mean(), inplace=True)
train["precio"].fillna(train["precio"].mean(), inplace=True)
train.dropna(subset=["tipodepropiedad"], inplace=True)
train.fillna(0, inplace=True)

#filtro la que no están en test para que train y test tengan las mismas columnas tras el encoding
train["tipodepropiedad"] = train[train["tipodepropiedad"].isin(test["tipodepropiedad"].unique())]
train.dropna(subset=["tipodepropiedad"], inplace=True)
train['provincia'] = train['provincia'].str.replace("provincia_0","nan")
#train["provincia"] = train[train["provincia"] != "Jalisco"]

In [35]:
train["tipodepropiedad"].value_counts().count()

22

In [36]:
test["tipodepropiedad"].value_counts().count()

22

In [37]:
train["provincia"].value_counts().count()

32

In [38]:
test["provincia"].value_counts().count()

32

In [39]:
train["provincia"].value_counts()

Distrito Federal         58789
Edo. de México           41604
Jalisco                  21238
Querétaro                16988
Nuevo León               15324
Puebla                   10421
San luis Potosí           8411
Yucatán                   7925
Morelos                   7337
Veracruz                  5760
Quintana Roo              4756
Chihuahua                 4590
Coahuila                  3695
Baja California Norte     3220
Sonora                    2988
Guanajuato                2860
Guerrero                  2678
Hidalgo                   2521
Michoacán                 2471
Tamaulipas                2303
Durango                   2275
Sinaloa                   1806
Aguascalientes            1753
Baja California Sur       1700
Nayarit                   1352
Chiapas                   1126
Colima                    1008
Tabasco                    994
Tlaxcala                   839
Oaxaca                     711
Campeche                   262
Zacatecas                   94
Name: pr

In [40]:
train.isna().sum()

tipodepropiedad                 0
provincia                     153
antiguedad                      0
metroscubiertos                 0
metrostotales                   0
gimnasio                        0
usosmultiples                   0
piscina                         0
escuelascercanas                0
centroscomercialescercanos      0
precio                          0
dtype: int64

In [41]:
train_OHE  = pd.get_dummies(train)

In [42]:
train_OHE.head(5)

Unnamed: 0_level_0,antiguedad,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,tipodepropiedad_Apartamento,...,provincia_Quintana Roo,provincia_San luis Potosí,provincia_Sinaloa,provincia_Sonora,provincia_Tabasco,provincia_Tamaulipas,provincia_Tlaxcala,provincia_Veracruz,provincia_Yucatán,provincia_Zacatecas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254099,8.116114,80.0,80.0,0.0,0.0,0.0,0.0,0.0,2273000.0,1,...,0,0,0,0,0,0,0,0,0,0
53461,10.0,268.0,180.0,0.0,0.0,0.0,1.0,1.0,3600000.0,0,...,0,0,0,0,0,0,0,0,0,0
247984,5.0,144.0,166.0,0.0,0.0,0.0,0.0,0.0,1200000.0,0,...,0,0,0,0,0,0,0,0,0,0
209067,1.0,63.0,67.0,0.0,0.0,0.0,1.0,1.0,650000.0,0,...,0,0,0,0,0,0,0,0,0,0
185997,10.0,95.0,95.0,0.0,0.0,0.0,0.0,0.0,1150000.0,1,...,0,0,0,0,0,0,0,0,0,0


### Limpio el set test: relleno con promedios y moda

In [43]:
columnas_test = ["tipodepropiedad", "provincia", "antiguedad", "metroscubiertos",
                 "metrostotales", "gimnasio", "usosmultiples", "piscina", "escuelascercanas", "centroscomercialescercanos"]
test = test[columnas_test]

In [44]:
test.isna().sum()

tipodepropiedad                   7
provincia                        42
antiguedad                    10714
metroscubiertos                4299
metrostotales                 12655
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
dtype: int64

In [45]:
#ciudad = test["ciudad"].mode()
#tipodepropiedad = test["tipodepropiedad"].mode()

test = test.fillna(value = {'tipodepropiedad' : test["tipodepropiedad"].mode().to_string(),
                            'provincia' : test["provincia"].mode().to_string()})
#test["tipodepropiedad"].fillna(test["ciudad"].mode(), inplace=True)
#est["ciudad"].fillna(test["ciudad"].mode(), inplace=True)
test["antiguedad"].fillna(test["antiguedad"].mean(), inplace=True)
test["metroscubiertos"].fillna(test["metroscubiertos"].mean(), inplace=True)
test["metrostotales"].fillna(test["metrostotales"].mean(), inplace=True)

test['provincia'] = test['provincia'].str.replace('0    Distrito Federal','Distrito Federal') #Sino, explota el universo >:[
test['tipodepropiedad'] = test['tipodepropiedad'].str.replace('0    Casa',"Casa")

In [46]:
test.isna().sum()

tipodepropiedad               0
provincia                     0
antiguedad                    0
metroscubiertos               0
metrostotales                 0
gimnasio                      0
usosmultiples                 0
piscina                       0
escuelascercanas              0
centroscomercialescercanos    0
dtype: int64

In [47]:
X = train_OHE.drop([TARGET], axis = 1).copy().values
y = list(train_OHE[TARGET].copy())

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [49]:
mejor_rf = RandomForestRegressor(n_estimators = 200, min_samples_split = 5, min_samples_leaf = 1, max_features = "sqrt",
                                max_depth = 50)
mejor_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [50]:
evaluar_rf(mejor_rf, X_test, y_test)

Performance del modelo:
Average Error: 733546.7489 degrees.
Accuracy = 64.35%.


64.34786228650566

In [51]:
train["tipodepropiedad"].value_counts().count()

22

In [52]:
test["tipodepropiedad"].value_counts().count()

22

In [53]:
test_OHE = pd.get_dummies(test)

In [54]:
y_pred = mejor_rf.predict(test_OHE)

In [55]:
df = pd.DataFrame({"id": test.index, "target": y_pred})
#df = df.head(60000)
#df = df.astype({'id': 'int32', 'target': 'int32'})

In [56]:
filename = 'RF_submit.csv'

df.to_csv(filename,index=False)