In [1]:
import pandas as pd
import re
import os
import pickle

In [3]:
# Data Collection
# Load the rental listings from a CSV file addressed relative to the
# notebook's working directory, then preview the first rows to check the
# available columns.
data = pd.read_csv('../data/rent_apartments.csv')
data.head()

Unnamed: 0,address,area,constraction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,zip,neighborhood,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,yes,no,no,yes,no,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,yes,no,no,yes,no,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,no,no,no,no,no,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250


In [4]:
# Select only the columns to analyse: the flag-like features whose raw values
# are text (the preview above shows 'yes' / 'no').
categorical_columns = ['balcony', 'storage', 'parking', 'furnished', 'garage']

# Use nunique to count the distinct categories in each column. NaN is not
# counted by default, so this check does not reveal missing values.
category_counts = data[categorical_columns].nunique()
print(category_counts)

balcony      2
storage      2
parking      2
furnished    2
garage       2
dtype: int64


In [5]:
# Data Preparation
# Encode the yes/no flag columns as integers: 'yes' -> 1, anything else
# (including 'no' and missing values) -> 0.
# Accepting an existing 1 as well keeps this cell safe to re-run: a plain
# `x == 'yes'` comparison would reset every already-encoded flag to 0 on a
# second execution. isin() is also available on pandas versions that predate
# DataFrame.map.
data[categorical_columns] = data[categorical_columns].isin(['yes', 1]).astype('int64')
data

Unnamed: 0,address,area,constraction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,zip,neighborhood,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,1,0,0,1,0,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,1,0,1,1,0,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,1,0,1,1,0,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,1,0,0,1,0,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,0,0,0,0,0,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,1033 DL Amsterdam (Terrasdorp),75.0,1990,3,2,1,0,0,0,1,0,Not present,C,,1033 DL,Terrasdorp,1450
1724,1033 DZ Amsterdam (Terrasdorp),75.0,1990,3,2,1,1,0,0,1,0,Not present,C,Shower,1033 DZ,Terrasdorp,1500
1725,1021 NX Amsterdam (IJplein e.o.),74.0,1986,2,1,1,0,0,0,1,0,Not present,,,1021 NX,IJplein e.o.,1400
1726,1021 EC Amsterdam (Vogelbuurt Zuid),118.0,1920,5,4,1,1,1,1,1,0,Not present,G,"Storage space, Shower, Toilet",1021 EC,Vogelbuurt Zuid,2650


In [6]:
# Helper that pulls the number of square metres out of a garden description.
def extract_garden_area(value):
    """Return the garden size in whole square metres found in ``value``.

    ``value`` is converted to text and searched for the first run of digits
    that is followed (optionally after whitespace) by ``m²``.

    Returns:
        The matched digits as an ``int``, or ``0`` when the text contains no
        such pattern (for example ``'Not present'`` or a missing value).
    """
    found = re.search(r'(\d+)\s*m²', str(value))
    # 0 is the default when no area is mentioned.
    return int(found.group(1)) if found else 0
    
# Apply the function to the 'garden' column.
# Convert only while the column is not yet integer-typed: once it holds the
# extracted areas, running the conversion again would find no 'm²' in any
# value and overwrite every garden area with 0.
if not pd.api.types.is_integer_dtype(data['garden']):
    data['garden'] = data['garden'].apply(extract_garden_area)

In [7]:
# Keep every column except the free-text / location ones, which are not used
# as model inputs; the original column order is preserved.
data = data.loc[:, ~data.columns.isin(['address', 'energy', 'facilities', 'zip', 'neighborhood'])]
data

Unnamed: 0,area,constraction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,rent
0,167.0,1870,3,2,2,1,0,0,1,0,0,4500
1,150.0,1890,3,2,2,1,0,1,1,0,0,3450
2,150.0,1890,3,2,2,1,0,1,1,0,0,3450
3,90.0,1923,3,2,1,1,0,0,1,0,0,2000
4,104.0,1923,3,2,1,0,0,0,0,0,47,3250
...,...,...,...,...,...,...,...,...,...,...,...,...
1723,75.0,1990,3,2,1,0,0,0,1,0,0,1450
1724,75.0,1990,3,2,1,1,0,0,1,0,0,1500
1725,74.0,1986,2,1,1,0,0,0,1,0,0,1400
1726,118.0,1920,5,4,1,1,1,1,1,0,0,2650


In [8]:
# Split the frame into the feature matrix (every column except 'rent') and
# the target. Selecting the target by name always yields a Series, whereas a
# boolean column mask followed by squeeze() collapses to a scalar when the
# frame has a single row. Both lookups raise KeyError if 'rent' is missing
# instead of silently producing an empty target.
X = data.drop(columns='rent')
y = data['rent']

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, identical on every run, so scores stay
# comparable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model with default hyperparameters. random_state fixes the
# randomness used while building the trees, so the fit is reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [12]:
# R² of the baseline forest on the held-out test split.
rf.score(X_test, y_test)

0.6993776008460116

In [13]:
# Tuning hyperparameters
from sklearn.model_selection import GridSearchCV

# Candidate values: 3 forest sizes x 4 depth limits = 12 combinations.
grid_space = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9, 12]
}

# 5-fold cross-validated search scored by R². The estimator is seeded so that
# score differences between candidates come from the hyperparameters rather
# than from run-to-run randomness in the forest.
grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=grid_space, cv=5, scoring='r2')

In [14]:
# Run the search on the training split only. fit() returns the fitted
# GridSearchCV object itself, so model_grid is the same object as grid.
model_grid = grid.fit(X_train, y_train)

In [15]:
# best_score_ is the mean cross-validated score (R² here) of the best
# candidate over the training folds; it is not a test-set score.
print(f'Best hyperparameters are {model_grid.best_params_}, score = {model_grid.best_score_}')

Best hyperparameters are {'max_depth': 9, 'n_estimators': 200}, score = 0.7696886282582257


In [16]:
# Models management
# Create the "models" folder if it does not exist yet. exist_ok=True replaces
# the separate existence check and so removes the window in which the folder
# could appear between the check and the creation.
os.makedirs('../models', exist_ok=True)

# Save the model in the "models" folder.
# NOTE(review): this persists `rf`, the baseline forest fitted with default
# hyperparameters, not the tuned estimator found by the grid search — confirm
# that this is the intended artifact for v1.
with open('../models/rf_v1.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [17]:
# Load the model back from "../models" to check that the saved file can be
# restored. pickle can execute arbitrary code while loading, so only
# unpickle files that come from a trusted source.
with open('../models/rf_v1.pkl', 'rb') as file:
    rf_v1 = pickle.load(file)

In [18]:
# Display the restored estimator.
rf_v1