In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

Data Cleaning for Sequence Model

In [2]:
df = pd.read_csv("Cleaned_Apartments.csv")

In [3]:
df = df.drop(columns=['id'])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   city                  147 non-null    int64  
 1   type                  147 non-null    int64  
 2   squareMeters          147 non-null    float64
 3   rooms                 147 non-null    int64  
 4   floor                 147 non-null    int64  
 5   floorCount            147 non-null    int64  
 6   buildYear             147 non-null    int64  
 7   latitude              147 non-null    float64
 8   longitude             147 non-null    float64
 9   centreDistance        147 non-null    float64
 10  poiCount              147 non-null    int64  
 11  schoolDistance        147 non-null    float64
 12  clinicDistance        147 non-null    float64
 13  postOfficeDistance    147 non-null    float64
 14  kindergartenDistance  147 non-null    float64
 15  restaurantDistance    1

In [5]:
# Extracting sorted_prices list
df['sorted_prices'] = df['sorted_prices'].str.strip('[]').str.split(', ')

# Creating individual columns for prices
for i in range(len(df['sorted_prices'].iloc[0])):
    df[f'price_{i+1}'] = df['sorted_prices'].apply(lambda x: int(x[i]))

# Dropping the original sorted_prices column
df.drop(columns=['sorted_prices'], inplace=True)

df

Unnamed: 0,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,...,hasElevator,hasSecurity,hasStorageRoom,price_1,price_2,price_3,price_4,price_5,price_6,price_7
0,1,3,77.71,3,3,3,1925,53.433019,14.569775,1.22,...,0,0,1,550000,550000,550000,650000,599000,599000,599000
1,1,3,78.60,3,2,4,1929,53.432833,14.548187,1.35,...,0,0,1,440000,440000,440000,440000,440000,440000,440000
2,1,3,100.00,3,4,4,1930,53.436494,14.572026,1.63,...,0,0,1,799000,799000,739000,739000,739000,739000,739000
3,1,2,96.06,4,3,3,2009,53.465600,14.541539,4.87,...,0,0,0,768000,768000,799000,799000,799000,799000,799000
4,1,3,42.57,2,1,1,1938,53.495690,14.584651,8.19,...,0,0,1,389000,389000,379000,379000,379000,379000,379000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,14,1,78.90,3,2,2,2004,53.108380,17.980800,2.02,...,0,0,1,640000,640000,640000,640000,640000,640000,640000
143,14,3,39.00,2,1,4,1914,53.126000,18.007900,0.65,...,0,0,1,205000,205000,205000,205000,205000,205000,205000
144,14,2,37.30,2,10,10,1980,53.124984,18.049752,3.29,...,1,0,1,219000,219000,219000,219000,219000,219000,219000
145,14,3,95.00,4,1,3,1893,53.126000,18.007900,0.65,...,0,0,1,599000,599000,599000,599000,599000,599000,599000


Training the Data for Sequence Model

In [6]:
# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)

# Split into input (X) and output (y) variables
X = scaled_data[:, :-1]  # All columns except the last one (price_7) as input
y = scaled_data[:, -1]    # Last column (price_7) as output

# Reshape input data to 3D tensor (samples, timesteps, features)
# Assuming you want to consider previous 6 prices to predict the next one (change it as necessary)
timesteps = 6
X_reshaped = np.array([X[i:timesteps+i] for i in range(len(X)-timesteps)])
y_reshaped = y[timesteps:]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_reshaped, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))
model.add(Dense(units=1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

# Make predictions
y_pred = model.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.015911100432276726
R-squared: 0.3059586653938926


Save the Sequence Model

In [7]:
model.save('sequence_model.h5')

  saving_api.save_model(


Data Cleaning for Predicting Price Based on Features

In [8]:
import pandas as pd
df = pd.read_csv("apartments_pl/apartments_pl_2024_02.csv")

In [9]:
df = df.dropna()
df

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
2,42758bc1fd0885a6221965f2c96da033,szczecin,tenement,48.84,2.0,2.0,3.0,1950.0,53.432222,14.555833,...,0.121,condominium,brick,low,no,yes,no,no,no,389000
6,2ee4c3c202ba72a65581f019a592bc92,szczecin,apartmentBuilding,75.00,3.0,4.0,4.0,1999.0,53.427312,14.549056,...,0.111,cooperative,brick,premium,yes,yes,no,yes,yes,890000
10,1deea3ea4c2760c5f511bb9703312a78,szczecin,apartmentBuilding,48.77,2.0,5.0,7.0,2019.0,53.447464,14.557782,...,0.177,condominium,brick,premium,yes,yes,yes,no,yes,699000
11,b4581dfeed8052716c41d62f39bb9705,szczecin,blockOfFlats,61.31,3.0,3.0,3.0,2020.0,53.462647,14.544680,...,0.189,condominium,brick,premium,yes,no,yes,no,no,650000
12,76affd52069119ac305ee74b21b4aed7,szczecin,blockOfFlats,56.58,2.0,4.0,5.0,2019.0,53.439113,14.490044,...,0.258,condominium,brick,premium,yes,yes,yes,no,no,580000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16299,307a70d93015d7141669edc0e462e434,bydgoszcz,tenement,39.00,2.0,1.0,4.0,1914.0,53.126000,18.007900,...,0.205,condominium,brick,low,no,no,no,no,yes,205000
16324,da0d7d8d1ab4c296faddeb207804ed17,bydgoszcz,blockOfFlats,37.30,2.0,10.0,10.0,1980.0,53.124984,18.049752,...,0.155,cooperative,concreteSlab,low,no,yes,yes,no,yes,219000
16335,36112fd62a6c141ef83cdbca1b2c53a1,bydgoszcz,apartmentBuilding,71.72,3.0,1.0,2.0,2012.0,53.124984,18.049752,...,0.155,condominium,brick,premium,no,yes,yes,no,no,849000
16347,8b4f48a955b5c1e47dce94ceb7fd78e9,bydgoszcz,blockOfFlats,74.00,3.0,1.0,4.0,1960.0,53.105804,18.054872,...,0.277,condominium,brick,low,no,no,no,no,yes,390000


In [10]:

df['condition'] = df['condition'].map({'low': 0, 'premium': 1})
df['type'] = df['type'].map({'apartmentBuilding': 1, 'blockOfFlats': 2, 'tenement': 3})
for col in ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']:
    df[col] = df[col].map({'no': 0, 'yes': 1})

df['city'] = df['city'].map({'szczecin':1,
                             'gdynia':2,
                             'krakow':3,
                             'poznan':4,
                             'bialystok':5,
                             'gdansk':6,
                             'wroclaw':7,
                             'radom':8,
                             'rzeszow':9,
                             'katowice':10,
                             'lublin':11,
                             'czestochowa':12,
                             'warszawa':13,
                             'bydgoszcz':14
                             })
df['ownership'] = df['ownership'].map({'condominium': 0, 'cooperative': 1})
df['buildingMaterial'] = df['buildingMaterial'].map({'brick': 0, 'concreteSlab': 1})


In [11]:
df.drop(columns=['id'], inplace=True)
df.dropna(inplace = True)
df

Unnamed: 0,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
2,1.0,3,48.84,2.0,2.0,3.0,1950.0,53.432222,14.555833,1.04,...,0.121,0.0,0,0,0,1,0,0,0,389000
6,1.0,1,75.00,3.0,4.0,4.0,1999.0,53.427312,14.549056,0.90,...,0.111,1.0,0,1,1,1,0,1,1,890000
10,1.0,1,48.77,2.0,5.0,7.0,2019.0,53.447464,14.557782,2.68,...,0.177,0.0,0,1,1,1,1,0,1,699000
11,1.0,2,61.31,3.0,3.0,3.0,2020.0,53.462647,14.544680,4.50,...,0.189,0.0,0,1,1,0,1,0,0,650000
12,1.0,2,56.58,2.0,4.0,5.0,2019.0,53.439113,14.490044,5.01,...,0.258,0.0,0,1,1,1,1,0,0,580000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16299,14.0,3,39.00,2.0,1.0,4.0,1914.0,53.126000,18.007900,0.65,...,0.205,0.0,0,0,0,0,0,0,1,205000
16324,14.0,2,37.30,2.0,10.0,10.0,1980.0,53.124984,18.049752,3.29,...,0.155,1.0,1,0,0,1,1,0,1,219000
16335,14.0,1,71.72,3.0,1.0,2.0,2012.0,53.124984,18.049752,3.29,...,0.155,0.0,0,1,0,1,1,0,0,849000
16347,14.0,2,74.00,3.0,1.0,4.0,1960.0,53.105804,18.054872,4.04,...,0.277,0.0,0,0,0,0,0,0,1,390000


Training for Price Prediction Model

In [12]:
# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)

# Split into input (X) and output (y) variables
X = scaled_data[:, :-1]  
y = scaled_data[:, -1]   

# Reshape input data to 3D tensor (samples, timesteps, features)
# Assuming you want to consider previous 6 prices to predict the next one (change it as necessary)
timesteps = 6
X_reshaped = np.array([X[i:timesteps+i] for i in range(len(X)-timesteps)])
y_reshaped = y[timesteps:]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_reshaped, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))
model.add(Dense(units=1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.027083128690719604


Save the Model for Price Prediction

In [13]:
model.save('price_prediction_model.h5')

  saving_api.save_model(
