In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Load the cleaned listings table; the path is relative to the notebook's working directory.
df = pd.read_csv('urbania_data_clean.csv')

# Eliminando columnas que no aportan al modelo

In [4]:
# Remove the columns judged not useful for the model. `columns=` already
# targets the column axis, so the redundant `axis=1` argument is not passed.
# NOTE: this rebinds `df`; re-running the cell on the already-trimmed frame
# raises KeyError because the labels are gone.
df = df.drop(columns=["Descripcion", "Anunciante", "Fecha_pub", "latitud", "longitud"])

In [5]:
# Print the frame's index range, column count, dtype breakdown and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7644 entries, 0 to 7643
Columns: 7295 entries, Antiguedad to Area_total_m2_bins_(9.519, 10.657]
dtypes: float64(10), int64(7284), object(1)
memory usage: 425.4+ MB


# Convirtiendo las variables categoricas a numericas

In [6]:
# Column groups chosen after reviewing each column.

## NUMERIC
# Original numeric columns; 'Precio' is the regression target used further down.
# 'fecha_publicacion' is deliberately left out: it is only used to order rows.
num_cols_originales = [
    'Antiguedad',
    'NroBanios',
    'Nro_pisos',
    'Cocheras',
    'Dormitorios',
    'Precio',
    'Area_constr_m2',
    'Area_total_m2',
]

# Original numeric columns followed by the three ratio* columns. Built from the
# list above (as a new list, not an alias) so the two definitions cannot drift.
num_cols = num_cols_originales + [
    "ratioConstruccion",
    "ratioBanios",
    "ratioDormitorios",
]

In [7]:
# Order rows by publication date; the positional 90/10 hold-out taken later
# depends on this order. A stable sort keeps rows with equal dates in their
# incoming order, so the split is reproducible and re-running the cell is a no-op.
# NOTE(review): 'fecha_publicacion' is object dtype, so this is presumably a
# string sort - chronological only if values are ISO-like (YYYY-MM-DD); confirm.
df = df.sort_values(by='fecha_publicacion', kind='mergesort')

In [36]:
# List the columns that still have object (non-numeric) dtype.
df.select_dtypes('object').columns

Index(['fecha_publicacion'], dtype='object')

In [8]:
# List every column with a numeric dtype.
df.select_dtypes('number').columns

Index(['Antiguedad', 'NroBanios', 'Nro_pisos', 'Cocheras', 'Dormitorios',
       'Precio', 'Area_constr_m2', 'Area_total_m2', 'Balneario_IMP',
       'Estado de Inmueble_IMP',
       ...
       'Area_total_m2_bins_(-0.739, 0.411]',
       'Area_total_m2_bins_(0.411, 1.549]',
       'Area_total_m2_bins_(1.549, 2.688]',
       'Area_total_m2_bins_(2.688, 3.826]',
       'Area_total_m2_bins_(3.826, 4.965]',
       'Area_total_m2_bins_(4.965, 6.103]',
       'Area_total_m2_bins_(6.103, 7.242]', 'Area_total_m2_bins_(7.242, 8.38]',
       'Area_total_m2_bins_(8.38, 9.519]',
       'Area_total_m2_bins_(9.519, 10.657]'],
      dtype='object', length=7294)

## DF con solo variables numericas originales

In [9]:
# Frame restricted to the original numeric columns (the 'Precio' target is among them).
df_num_org = df[num_cols_originales]

## DF con las variables numéricas originales más los ratios

In [10]:
# Frame with the original numeric columns plus the three ratio* columns.
df_num = df[num_cols]

## DF con variables numericas y categoricas

In [11]:
# All remaining columns except the date column, which is object dtype and was
# used above only to order the rows.
df_num_cat = df.drop(columns = ["fecha_publicacion"])

# Separando dataset en val, test, train

In [12]:
def train_test_val_split(df, train_frac=0.9, test_size=0.2, random_state=0):
    """Split a frame into train, test and validation partitions.

    The validation partition is the trailing ``1 - train_frac`` share of the
    rows *by position*, so the caller decides what "last" means through the
    row order of ``df``. The leading rows are then shuffled and divided into
    train and test with scikit-learn's ``train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split; only positional order is used, not the index labels.
    train_frac : float, default 0.9
        Share of leading rows kept for train + test; must lie strictly
        between 0 and 1.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` for the train/test division.
    random_state : int or None, default 0
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(df_train, df_test, df_val)``.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    # Compute the boundary once so both slices agree on the cut point.
    cut = int(df.shape[0] * train_frac)
    df_training = df.iloc[:cut]
    df_val = df.iloc[cut:]

    df_train, df_test = train_test_split(
        df_training, random_state=random_state, test_size=test_size
    )
    return df_train, df_test, df_val

In [13]:
def x_y_split(df, target):
    """Separate a frame into its feature columns and its target values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target.
    target : label
        Column to predict.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``df`` without ``target`` and ``y`` is
        ``df[target].values``. ``df`` itself is left untouched.
    """
    return df.drop(columns=target), df[target].values

## Modelo 1: variables numericas originales

In [14]:
# Partition the original-numeric frame into train / test / validation sets.
df_train, df_test, df_val = train_test_val_split(df_num_org)

In [15]:
# Separate the features from the "Precio" target for each partition.
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    x_y_split(part, "Precio") for part in (df_train, df_test, df_val)
)

In [16]:
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

In [17]:
# Fit on the training partition only; test and validation are not seen here.
model.fit(x_train, y_train)

LinearRegression()

In [18]:
# Report the estimator's `score` on each partition, in train / test / val order.
for _label, _features, _target in (
    ("train: ", x_train, y_train),
    ("test: ", x_test, y_test),
    ("val: ", x_val, y_val),
):
    print(_label, model.score(_features, _target))

train:  0.33703055401273463
test:  0.3258099317251195
val:  0.3067394073643902


## Modelo 2: variables numéricas originales más los ratios

In [19]:
# Partition the numeric frame (original columns plus ratio* columns).
df_train, df_test, df_val = train_test_val_split(df_num)

In [20]:
# Pull the "Precio" target out of each partition, keeping the rest as features.
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    x_y_split(part, "Precio") for part in (df_train, df_test, df_val)
)

In [21]:
# Fresh estimator so nothing carries over from the previous model.
model = LinearRegression()

In [22]:
# Fit on the training partition of the numeric frame.
model.fit(x_train, y_train)

LinearRegression()

In [23]:
# Print the estimator's `score` for the train, test and validation partitions.
for _label, _features, _target in (
    ("train: ", x_train, y_train),
    ("test: ", x_test, y_test),
    ("val: ", x_val, y_val),
):
    print(_label, model.score(_features, _target))

train:  0.34338096155426634
test:  0.32270988799164557
val:  0.3014237880238675


## Modelo 3: variables numéricas con el precio transformado con log1p

In [24]:
# Same numeric frame as the previous model; re-split so this section is self-contained.
df_train, df_test, df_val = train_test_val_split(df_num)

In [25]:
# Features and raw "Precio" target per partition (the target is transformed in the next cell).
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    x_y_split(part, "Precio") for part in (df_train, df_test, df_val)
)

In [26]:
# Log-transform the target with log1p. The values are recomputed from the split
# frames instead of from y_* themselves, so re-running this cell can never apply
# the transform twice.
y_train = np.log1p(df_train["Precio"].values)
y_test = np.log1p(df_test["Precio"].values)
y_val = np.log1p(df_val["Precio"].values)

In [27]:
# Fresh default linear regression; it will be trained against the log1p target.
model = LinearRegression()

In [28]:
# Fit on the training partition; y_train holds log1p-transformed prices here.
model.fit(x_train, y_train)

LinearRegression()

In [29]:
# Print the estimator's `score` per partition. The targets are log1p(Precio),
# so these values are measured on the log scale, not on raw prices.
for _label, _features, _target in (
    ("train: ", x_train, y_train),
    ("test: ", x_test, y_test),
    ("val: ", x_val, y_val),
):
    print(_label, model.score(_features, _target))

train:  0.4605155258008512
test:  0.4102120881652177
val:  0.44754217946447317


## Modelo 4: variables numéricas y categóricas

In [37]:
# Partition the full frame (every column except the date column).
df_train, df_test, df_val = train_test_val_split(df_num_cat)

In [38]:
# Split each partition of the full frame into features and the "Precio" target.
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    x_y_split(part, "Precio") for part in (df_train, df_test, df_val)
)

In [39]:
# Default linear regression on the full feature set.
# NOTE(review): the recorded run scores about 0.95 on train but about -1.5e16 on
# test and -1.2e16 on validation. df.info() reports 7,295 columns for 7,644 rows,
# so after the 90% / 80% splits there appear to be more features than training
# rows and the fit is likely underdetermined. Consider a regularized estimator
# (e.g. Ridge/Lasso) or fewer dummy columns - TODO confirm.
model = LinearRegression()

In [40]:
# Fit on the training partition of the full frame.
model.fit(x_train, y_train)

LinearRegression()

In [41]:
# Print the estimator's `score` for each partition of the full-feature model.
for _label, _features, _target in (
    ("train: ", x_train, y_train),
    ("test: ", x_test, y_test),
    ("val: ", x_val, y_val),
):
    print(_label, model.score(_features, _target))

train:  0.9503565037561439
test:  -1.5423831233856188e+16
val:  -1.2104632580376532e+16


## Modelo 5: variables numéricas y categóricas con el precio transformado con log1p

In [42]:
# Same full frame as the previous model; re-split so this section is self-contained.
df_train, df_test, df_val = train_test_val_split(df_num_cat)

In [43]:
# Features and raw "Precio" target per partition (the target is transformed in the next cell).
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    x_y_split(part, "Precio") for part in (df_train, df_test, df_val)
)

In [44]:
# Log-transform the target with log1p. Reading the prices from the split frames
# (not from y_* themselves) keeps this cell idempotent: running it again does not
# stack a second log1p on top of the first.
y_train = np.log1p(df_train["Precio"].values)
y_test = np.log1p(df_test["Precio"].values)
y_val = np.log1p(df_val["Precio"].values)

In [45]:
# Default linear regression on the full feature set with a log1p target.
# NOTE(review): the recorded run scores about 0.99 on train but about -1.9e16 on
# test and -2.5e16 on validation, the same pattern as the previous full-feature
# model. With 7,295 columns for 7,644 rows the fit is likely underdetermined;
# consider a regularized estimator or fewer dummy columns - TODO confirm.
model = LinearRegression()

In [46]:
# Fit on the training partition; y_train holds log1p-transformed prices here.
model.fit(x_train, y_train)

LinearRegression()

In [47]:
# Print the estimator's `score` per partition. The targets are log1p(Precio),
# so these values are measured on the log scale, not on raw prices.
for _label, _features, _target in (
    ("train: ", x_train, y_train),
    ("test: ", x_test, y_test),
    ("val: ", x_val, y_val),
):
    print(_label, model.score(_features, _target))

train:  0.9936022821410706
test:  -1.8775526730115772e+16
val:  -2.484231073898019e+16
