# Apartment price prediction with Keras and Torch

## Libraries and settings

In [120]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

## Import the apartment data

In [121]:
# Read and select variables
df_orig = pd.read_csv("apartments_data_enriched_with_new_features.csv", sep=",", encoding='utf-8')

# Remove missing values
df = df_orig.dropna()
# Remove duplicates
df = df.drop_duplicates()

print(df.shape)
df.head(10)

(2344, 45)


Unnamed: 0.1,Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,...,Kreis 10,Kreis 4,Kreis 1,Kreis 9,Kreis 5,Kreis 7,Kreis 3,Kreis 2,Kreis 8,zurich_city
0,0,261,4.5,148,4180,8050,"Schaffhauserstrasse 363, 8050 Zürich",Zürich,««Renovierte 4.5-Zimmerwohnung an zentraler La...,Zürich,...,0,0,0,0,0,0,0,0,0,1
1,1,261,2.0,122,3190,8050,"Max Bill Platz 5, 8050 Zürich",Zürich,«Modernes Wohnen im Zentrum von Oerlikon»,Zürich,...,0,0,0,0,0,0,0,0,0,1
2,2,261,3.5,78,2780,8050,"Regensbergstrasse 30, 8050 Zürich",Zürich,«Wohnen am Puls von Zürich»,Zürich,...,0,0,0,0,0,0,0,0,0,1
3,3,261,3.5,69,3750,8050,"Dörflistrasse 112, 8050 Zürich",Zürich,"«Trendy Oerlikon 3.5 Room w/ Balcony, 1min to ...",Zürich,...,0,0,0,0,0,0,0,0,0,1
4,4,261,3.5,74,2390,8050,"Schaffhauserstrasse 445, 8050 Zürich",Zürich,«Helle und zentrale Wohnung in Zürich-Oerlikon»,Zürich,...,0,0,0,0,0,0,0,0,0,1
5,5,261,4.5,98,3800,8050,"Eggbühlstrasse 9, 8050 Zürich",Zürich,«An bester Wohnlage in Zürich-Oerlikon»,Zürich,...,0,0,0,0,0,0,0,0,0,1
6,6,261,2.5,60,2460,8050,"Leutschenbachstrasse 46, 8050 Zürich",Zürich,"«Schicke Oerlikon 2.5ZW mit Keller und W/T, na...",Zürich,...,0,0,0,0,0,0,0,0,0,1
7,7,261,2.5,64,2640,8050,"Siewerdtstrasse 19, 8050 Zürich",Zürich,«Erstvermietung - Stilvoll wohnen im Herzen vo...,Zürich,...,0,0,0,0,0,0,0,0,0,1
8,8,261,3.5,92,4210,8050,"Siewerdtstrasse 19, 8050 Zürich",Zürich,«Attikawohnung Erstvermietung - Stilvoll wohne...,Zürich,...,0,0,0,0,0,0,0,0,0,1
9,9,261,3.5,85,3130,8050,"Siewerdtstrasse 19, 8050 Zürich",Zürich,«Erstvermietung - Stilvoll wohnen im Herzen vo...,Zürich,...,0,0,0,0,0,0,0,0,0,1


In [122]:
# Meaning of variables:
# bfs_number: official municipality id
# bfs_name: official municipality name
# pop: number of residents (=population)
# pop_dens: population density (pop per km2)
# frg_pct: percentage foreigners
# emp: numer of employees

df.columns

Index(['Unnamed: 0', 'bfs_number', 'rooms', 'area', 'price', 'postalcode',
       'address', 'town', 'description_raw', 'bfs_name', 'pop', 'pop_dens',
       'frg_pct', 'emp', 'tax_income', 'lat', 'lon', 'x', 'y', 'room_per_m2',
       'price_per_m2', 'luxurious', 'temporary', 'furnished', 'area_cat',
       'area_cat_ecoded', '(LOFT)', '(POOL)', '(ATTIKA)', '(EXKLUSIV)',
       '(SEESICHT)', '(LUXURIÖS)', 'Kreis 6', 'Kreis 11', 'Kreis 12',
       'Kreis 10', 'Kreis 4', 'Kreis 1', 'Kreis 9', 'Kreis 5', 'Kreis 7',
       'Kreis 3', 'Kreis 2', 'Kreis 8', 'zurich_city'],
      dtype='object')

In [123]:
# Create train and test samples
features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'room_per_m2', 'luxurious', 'temporary', 'furnished', 'area_cat_ecoded', 'zurich_city']

X_train, X_test, y_train, y_test = train_test_split(df[features], 
                                                         df['price'], 
                                                        test_size=0.20, 
                                                        random_state=42)

<hr >

## Sklearn

In [124]:
# train random_forest_model = RandomForestRegressor()
random_forest_model = RandomForestRegressor(random_state=42)

# Fit the model
random_forest_model.fit(X_train, y_train)
print("Train score: ", random_forest_model.score(X_train, y_train))
print("Test score: ", random_forest_model.score(X_test, y_test))

print("Train RMSE: ", root_mean_squared_error(y_train, random_forest_model.predict(X_train)))
print("Test RMSE: ", root_mean_squared_error(y_test, random_forest_model.predict(X_test)))

Train score:  0.9437123170192876
Test score:  0.6580388522714511
Train RMSE:  255.7095825812701
Test RMSE:  638.1863870741726


<hr >

## Keras

In [125]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

def keras_regression_model(features, df):
    df = df.sample(frac=1, random_state=42)
    X, y = df[features].values, df['price'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = Sequential([
        Dense(64, activation='relu', input_dim=X_train.shape[1]),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    model.fit(X_train, y_train, epochs=50, batch_size=512)
    loss, mae_train = model.evaluate(X_train, y_train, verbose=0)
    print("Train Mean Absolute Error:", mae_train)
    loss, mae = model.evaluate(X_test, y_test, verbose=0)
    print("Test Mean Absolute Error:", mae)


In [126]:
# Without new features the performance is: AVG RMSE: -862.86

features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'room_per_m2', 'luxurious', 'temporary', 'furnished']
keras_regression_model(features, df)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 858800256.0000 - mae: 27194.4395   
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 192013488.0000 - mae: 12503.3447 
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 227102432.0000 - mae: 13467.8086 
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 125594288.0000 - mae: 9598.0938  
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 21454528.0000 - mae: 4257.7627 
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 36028016.0000 - mae: 5090.1831 
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 30748010.0000 - mae: 3888.5276 
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 5474939.5000 - mae: 1772.9971 
Epoch 9/50
[1m4/4[0m [3

<hr >

## Tourch

In [127]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

class RegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

def pytorch_regression_model(features, df):
    df = df.sample(frac=1, random_state=42)
    X, y = df[features].values, df['price'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
    X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

    model = RegressionModel(X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(100):
        optimizer.zero_grad()
        predictions = model(X_train)
        loss = criterion(predictions, y_train)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        train_predictions = model(X_train)
        train_loss = criterion(train_predictions, y_train).item()
        test_predictions = model(X_test)
        test_loss = criterion(test_predictions, y_test).item()

    print("Train Loss (RMSE):", np.sqrt(train_loss))
    print("Test Loss (RMSE):", np.sqrt(test_loss))

In [128]:
pytorch_regression_model(features, df)

Train Loss (RMSE): 958.7979453461506
Test Loss (RMSE): 1024.358274726182


In [129]:
import joblib
joblib.dump(random_forest_model, 'apartment_price_model.pkl')


['apartment_price_model.pkl']

In [130]:
import joblib
import numpy as np
import gradio as gr

# Load your trained model
model = joblib.load("apartment_price_model.pkl")

# Define the Gradio prediction function
def predict_price(rooms, area, pop, pop_dens, frg_pct, emp, tax_income,
                  room_per_m2, luxurious, temporary, furnished, area_cat_ecoded, zurich_city):
    
    features = [[rooms, area, pop, pop_dens, frg_pct, emp, tax_income,
                 room_per_m2, luxurious, temporary, furnished, area_cat_ecoded, zurich_city]]

    prediction = model.predict(features)
    return round(prediction[0], 2)

# Launch the Gradio interface
iface = gr.Interface(
    fn=predict_price,
    inputs=[
        gr.Number(label="Rooms"),
        gr.Number(label="Area (m²)"),
        gr.Number(label="Population"),
        gr.Number(label="Population Density"),
        gr.Number(label="Foreigner Percentage"),
        gr.Number(label="Employment Rate"),
        gr.Number(label="Tax Income"),
        gr.Number(label="Rooms per m²"),
        gr.Checkbox(label="Luxurious"),
        gr.Checkbox(label="Temporary"),
        gr.Checkbox(label="Furnished"),
        gr.Number(label="Area Category (Encoded)"),
        gr.Checkbox(label="Zurich City")
    ],
    outputs=gr.Number(label="Predicted Apartment Price (CHF)"),
    title="Apartment Price Prediction with Random Forest",
    examples=[
        [3, 80, 10000, 3000, 0.2, 5000, 100000, 1, 0, 0, 1, 1, 0],
        [4, 100, 20000, 5000, 0.3, 10000, 150000, 1, 1, 0, 0, 0, 1]
    ]
)

iface.launch()


* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


