In [1]:
%%capture
!pip install tensorflow=='2.3.0' keras=='2.3.1' numpy=='1.18.5'
!pip install ydata-profiling --upgrade

# Import libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU
import keras

import tensorflow as tf
import numpy as np
import os
import random
import pandas as pd

import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

%matplotlib inline
import matplotlib as mpl
mpl.rc("figure, fgsize=(16, 9)")

In [2]:
# Método generado para permitir que todas las ejecuciones terminen en una misma ejecución
def do_seeds(sn):
    os.environ['PYTHONHASHSEED']=str(sn)
    np.random.seed(sn)
    tf.random.set_seed(sn)
    random.seed(sn)

do_seeds(0)

**1. Cargar Ficheros**

In [3]:
#Import datasets
train_df = pd.read_csv('/content/train_v2.csv', sep=';')
test_df = pd.read_csv('/content/test_v2.csv', sep=';')

2. **Análisis** **exploratorio**

Detectar valores faltantes, duplicados y obtener estadísticas de las variables

In [4]:
# AE Train
print(train_df.shape)
train_df.head()

(999, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
profile = ProfileReport(train_df)
profile.to_notebook_iframe()
profile.to_file("report_train.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

El dataset cuenta con 81 variables de las cuales 29 son numerícas, 50 categorícas, 1 booleana y 1 de texto con un total de 999 observaciones. A su vez, tiene 5340 valores faltantes, lo que representa el 6,6% del total de los datos


In [6]:
#AE Test
print(test_df.shape)
test_df.head()

(461, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1000,20,RL,64.0,6762,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2010,WD,Normal,206000
1,1001,20,RL,74.0,10206,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2009,WD,Normal,82000
2,1002,30,RL,60.0,5400,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2007,WD,Abnorml,86000
3,1003,20,RL,75.0,11957,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2008,WD,Normal,232000
4,1004,90,RL,,11500,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2007,WD,Normal,136905


In [None]:
profile = ProfileReport(test_df)
profile.to_notebook_iframe()
profile.to_file("report_test.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

El dataset cuenta con 81 variables de las cuales 30 son numerícas, 49 categorícas, 1 booleana y 1 de texto con un total de 461 observaciones. A su vez, tiene 2489 valores faltantes, lo que representa el 6,7% del total de los datos

3**. Ingeniería de variables: Crear una variable**

In [8]:
# Crear una nueva variable en base a la variable "OverallQual"
def categorize_quality(x):
    if x <= 3:
        return 'Baja'
    elif 4 <= x <= 7:
        return 'Media'
    else:
        return 'Alta'

train_df['QualCategory'] = train_df['OverallQual'].apply(categorize_quality)
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,QualCategory
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,Media
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,Media
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,Media
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,Media
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,Alta


En base a la variable "OverallQual" se entiende que los usuarios estan calificando  la calidad de una vivienda en una escala de 1 a 10. Así que, se ha creado una variable que nos ayude a identificar si la clasifican como Baja, Media o Alta en base a sus calificaciones.

**4. Eliminar las variables de entrada no numéricas**

In [10]:
train_df_2 = train_df.select_dtypes(include=['number'])
test_df_2 = test_df.select_dtypes(include=['number'])

#Eliminar valores faltantes
print(train_df.shape, test_df.shape)
train_df_2= train_df_2.dropna()
test_df_2 = test_df_2.dropna()
print("Después de eliminar:", train_df_2.shape, test_df_2.shape)

(999, 82) (461, 81)
Después de eliminar: (770, 38) (351, 38)


**5. Eliminar la(s) variable(s) de entrada que no tengan sentido lógico para realizar la predicción**

In [11]:
# Definir variable objetivo
target= "SalePrice"

# Correlación con la variable objetivo
correlations = train_df_2.corr()[target]

print("\nCorrelaciones con la variable objetivo:")
print(correlations)

# Eliminar columnas que tienen baja correlación con la variable objetivo
correlation_threshold = 0.3
columns_to_drop = correlations[abs(correlations) < correlation_threshold].index
train_df_2_filtered = train_df_2.drop(columns=columns_to_drop)

print(train_df_2_filtered.shape)


Correlaciones con la variable objetivo:
Id              -0.020947
MSSubClass      -0.085480
LotFrontage      0.360597
LotArea          0.301179
OverallQual      0.807692
OverallCond     -0.141508
YearBuilt        0.523545
YearRemodAdd     0.535107
MasVnrArea       0.511948
BsmtFinSF1       0.402606
BsmtFinSF2      -0.003554
BsmtUnfSF        0.214621
TotalBsmtSF      0.652351
1stFlrSF         0.639910
2ndFlrSF         0.312787
LowQualFinSF    -0.006898
GrLivArea        0.742898
BsmtFullBath     0.240524
BsmtHalfBath    -0.038340
FullBath         0.561653
HalfBath         0.295058
BedroomAbvGr     0.165285
KitchenAbvGr    -0.132602
TotRmsAbvGrd     0.594443
Fireplaces       0.479970
GarageYrBlt      0.514611
GarageCars       0.674714
GarageArea       0.658847
WoodDeckSF       0.350523
OpenPorchSF      0.367164
EnclosedPorch   -0.150697
3SsnPorch       -0.004179
ScreenPorch      0.140703
PoolArea         0.009901
MiscVal         -0.091970
MoSold           0.034613
YrSold          -0.0246

In [12]:
# Correlación con la variable objetivo
correlations = test_df_2.corr()[target]

print("\nCorrelaciones con la variable objetivo:")
print(correlations)

# Eliminar columnas que tienen baja correlación con la variable objetivo
correlation_threshold = 0.3
columns_to_drop = correlations[abs(correlations) < correlation_threshold].index
test_df_2_filtered = test_df_2.drop(columns=columns_to_drop)

print(test_df_2_filtered.shape)


Correlaciones con la variable objetivo:
Id               0.024229
MSSubClass      -0.093189
LotFrontage      0.314282
LotArea          0.324934
OverallQual      0.774356
OverallCond     -0.090085
YearBuilt        0.531544
YearRemodAdd     0.490418
MasVnrArea       0.416585
BsmtFinSF1       0.369783
BsmtFinSF2      -0.094790
BsmtUnfSF        0.208650
TotalBsmtSF      0.545764
1stFlrSF         0.544946
2ndFlrSF         0.292448
LowQualFinSF     0.010477
GrLivArea        0.629546
BsmtFullBath     0.231990
BsmtHalfBath    -0.031560
FullBath         0.583203
HalfBath         0.200083
BedroomAbvGr     0.175104
KitchenAbvGr    -0.160716
TotRmsAbvGrd     0.443838
Fireplaces       0.421332
GarageYrBlt      0.482541
GarageCars       0.582517
GarageArea       0.530974
WoodDeckSF       0.304695
OpenPorchSF      0.281316
EnclosedPorch   -0.163910
3SsnPorch        0.150122
ScreenPorch      0.044389
PoolArea         0.216468
MiscVal          0.057467
MoSold           0.091502
YrSold           0.0163

**6. Normalizar variables de entrada mediante min-max**

In [13]:
# Se separan las variables explicativas de las variables a predecir
from sklearn.model_selection import train_test_split

X = train_df_2_filtered.drop(target, axis=1)
y = train_df_2_filtered[target].to_numpy().reshape((-1,1))

X_test = test_df_2_filtered.drop(target, axis=1)
y_test = test_df_2_filtered[target].to_numpy().reshape((-1,1))

In [14]:
# Convertir a DataFrames
X = pd.DataFrame(train_df_2_filtered)
X_test = pd.DataFrame(test_df_2_filtered)

# Alinear columnas del conjunto de prueba con el conjunto de entrenamiento
X_test_alineado = X_test.reindex(columns=X.columns, fill_value=0)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Se transforman las variables explicativas
scaler_x = MinMaxScaler()
scaler_x.fit(X)
X = scaler_x.transform(X)
X_test = scaler_x.transform(X_test_alineado)

In [16]:
# Se comprueba que todas las variables están en el rango de valores deseado
np.min(X, axis=0), np.max(X, axis=0)

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1.]))

In [17]:
# Se transforman la variable a predecir (Target)
scaler_y = MinMaxScaler()
scaler_y.fit(y)
y = scaler_y.transform(y)
y_test = scaler_y.transform(y_test)

In [18]:
# Se comprueba que todas las variables están en el rango de valores deseado
np.min(y, axis=0), np.max(y, axis=0)

(array([0.]), array([1.]))

**7. Dividir los datos de entrenamiento en Train (80%) y Validation (20%)**

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print('X Original:',X.shape,'; X train:',X_train.shape,'; X test:',X_val.shape)

X Original: (770, 20) ; X train: (616, 20) ; X test: (154, 20)


***8. Crear una Red Neuronal con 2 capas ocultas, 200 neuronas en cada capa y función de activación ReLu ***

**Nota**: La siguiente Red Neuronal tiene:
- Input: 20 datos
- Hidden Layer 1: 200 neuronas
- Hidden Layer 2: 200 neuronas
- Output Layer: 1 neurona (rango de valores posibles: -infinito a +infinito)

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import to_categorical

model = Sequential()
model.add(Dense(200, input_shape=(20,), activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation=None))

**9. Entrenar el algoritmo utilizando la métrica RMSE como función de coste**

In [21]:
model.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])

model.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f35e4a98a0>

In [22]:
#Reentrar modelo para evitar predicciones con precios inferiores a 0
model2 = Sequential()
model2.add(Dense(200, input_shape=(20,), activation='relu'))
model2.add(Dense(200, activation='relu'))
model2.add(Dense(1, activation=None))

model2.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])
model2.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f35dcc4fa0>

**10. Evaluar la predicción en Test**

In [23]:
print('[RMSE, Mean Absolute Error]')
model.evaluate(X_test,y_test)

[RMSE, Mean Absolute Error]


[0.0009656504262238741, 0.017999224364757538]

**11. Crear una arquitectura que produzca overfit**

In [25]:
model_overfit = Sequential()
model_overfit.add(Dense(200, input_shape=(20,), activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(200, activation='relu'))
model_overfit.add(Dense(1, activation='relu'))

model_overfit.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])
model_overfit.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f37b195a80>

**12. Probar 3 ejemplos con distintas regularizaciones y identificar la que mejor funciona**

In [26]:
# Regularizador L1
from tensorflow.keras import regularizers

model_reg1 = Sequential()
model_reg1.add(Dense(200, input_shape=(20,), activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.02)))
model_reg1.add(Dense(200, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.02))),
model_reg1.add(Dense(1, activation='relu'))

model_reg1.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])
model_reg1.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f37b17bbb0>

In [27]:
#Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout

model_reg2 = Sequential()
model_reg2.add(Dense(200, input_shape=(20,), activation='relu'))
model_reg2.add(Dropout(0.5))
model_reg2.add(Dense(200, activation='relu'))
model_reg2.add(Dropout(0.5))
model_reg2.add(Dense(1, activation='relu'))

model_reg2.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])
model_reg2.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f37f7a72e0>

In [30]:
#BatchNormalization
from tensorflow.keras.layers import BatchNormalization

model_reg3 = Sequential()
model_reg3.add(Dense(200, input_shape=(20,), activation='relu'))
model_reg3.add(BatchNormalization())
model_reg3.add(Dense(200, activation='relu'))
model_reg3.add(BatchNormalization())
model_reg3.add(Dense(1, activation='relu'))

model_reg3.compile(loss='mse', optimizer='adam', metrics=['mean_absolute_error'])
model_reg3.fit(X_train,y_train, epochs=7, validation_data=(X_val, y_val), batch_size=32)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x78f37b080a90>

El mejor modelo es que el tiene el menor error en los datos de validación, no entrenamiento.  En este caso, el Dropout es el mejor modelo entre los 3, el modelo que presenta el mejor rendimiento en cuanto a la menor pérdida y el menor error absoluto medio:
- Validación Loss Final: 0.0084
- Validación MAE Final: 0.0675
