In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import common.common_machine_learning as common

#pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [10]:
def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [11]:
TARGET = 'precio'
columnas = ["tipodepropiedad", "ciudad", "antiguedad", "metroscubiertos",
           "metrostotales", "gimnasio", "usosmultiples", "piscina", "escuelascercanas", "centroscomercialescercanos", "precio"]

In [12]:
train = pd.read_csv('sets_de_datos/train.csv', index_col = 0)
test = pd.read_csv('sets_de_datos/test.csv', index_col = 0)

In [13]:
train = train[columnas]
train["antiguedad"].fillna(train["antiguedad"].mean(),inplace=True)
train["metroscubiertos"].fillna(train["metroscubiertos"].mean(), inplace=True)
train["metrostotales"].fillna(train["metrostotales"].mean(), inplace=True)
train["precio"].fillna(train["precio"].mean(), inplace=True)
train.dropna(subset=["tipodepropiedad", "ciudad"], inplace=True)
train.fillna(0, inplace=True)

#test = train[columnas]
#test["antiguedad"].fillna(train["antiguedad"].mean(),inplace=True)
#test["metroscubiertos"].fillna(train["metroscubiertos"].mean(), inplace=True)
#test["metrostotales"].fillna(train["metrostotales"].mean(), inplace=True)
#test["precio"].fillna(train["precio"].mean(), inplace=True)
#test.dropna(subset=["tipodepropiedad", "ciudad"], inplace=True)
#test.fillna(0, inplace=True)

In [14]:
ciudades_validas = train["ciudad"].value_counts().to_frame().reset_index()
ciudades_validas.columns = ["ciudad", "cantidad"]
ciudades_validas.shape

(875, 2)

In [15]:
# Elimino las ciudades con menos de 100 casas para reducir la cantidad de col cuando se haga OHE
ciudades_validas = ciudades_validas[ciudades_validas["cantidad"] >= 100]
train = pd.merge(train, ciudades_validas, on = "ciudad", how = "inner")
train = train.drop(["cantidad"], axis = 1)
ciudades_validas.shape

(146, 2)

In [16]:
train_OHE  = pd.get_dummies(train)

In [17]:
train

Unnamed: 0,tipodepropiedad,ciudad,antiguedad,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,Apartamento,Benito Juárez,8.116114,80.0,80.000000,0.0,0.0,0.0,0.0,0.0,2273000.0
1,Casa,Benito Juárez,20.000000,227.0,185.000000,0.0,0.0,0.0,0.0,0.0,3000000.0
2,Apartamento,Benito Juárez,5.000000,127.0,127.000000,0.0,0.0,0.0,1.0,1.0,4750000.0
3,Apartamento,Benito Juárez,0.000000,98.0,176.765145,0.0,0.0,0.0,0.0,0.0,2100000.0
4,Apartamento,Benito Juárez,6.000000,160.0,176.765145,0.0,0.0,0.0,0.0,0.0,4050000.0
5,Apartamento,Benito Juárez,5.000000,122.0,122.000000,0.0,0.0,0.0,1.0,1.0,4200000.0
6,Apartamento,Benito Juárez,3.000000,120.0,176.765145,1.0,0.0,0.0,0.0,0.0,4500000.0
7,Apartamento,Benito Juárez,48.000000,75.0,75.000000,0.0,0.0,0.0,1.0,1.0,1300000.0
8,Apartamento,Benito Juárez,8.116114,86.0,86.000000,0.0,0.0,0.0,0.0,0.0,2609500.0
9,Apartamento,Benito Juárez,20.000000,120.0,120.000000,0.0,0.0,0.0,0.0,0.0,2500000.0


In [18]:
X = train_OHE.drop([TARGET], axis = 1).copy().values
y = list(train_OHE[TARGET].copy())

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [20]:
X_train

array([[ 10.        , 108.        ,  61.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        , 300.        , 370.        , ...,   0.        ,
          0.        ,   0.        ],
       [ 10.        , 160.        , 176.76514456, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  5.        , 150.        , 200.        , ...,   0.        ,
          0.        ,   0.        ],
       [  3.        , 215.        , 194.        , ...,   0.        ,
          0.        ,   0.        ],
       [ 10.        , 430.        , 176.76514456, ...,   0.        ,
          0.        ,   0.        ]])

In [21]:
mejor_rf = RandomForestRegressor(n_estimators = 200, min_samples_split = 5, min_samples_leaf = 1, max_features = "sqrt",
                                max_depth = 50)
mejor_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [22]:
evaluar_rf(mejor_rf, X_test, y_test)

Performance del modelo:
Average Error: 664556.4277 degrees.
Accuracy = 67.23%.


67.2300576704138

In [24]:
test_OHE = pd.get_dummies(test)

MemoryError: 

In [None]:
test_OHE

In [None]:
y_pred = mejor_rf.predict(X_test)

In [None]:
df = pd.DataFrame({"id": X_test[:,0], "target": y_pred})
#df = df.head(60000)
#df = df.astype({'id': 'int32', 'target': 'int32'})

In [None]:
filename = 'RF_submit.csv'

df.to_csv(filename,index=False)

In [None]:
df