# Apartments costs regression model

## Imports

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

##  Data exploration

In [2]:
# Load a dataset
raw_data = pd.read_csv("apartments_pl_2023q3.csv")

In [9]:
# Check the dataset size
raw_data.shape

(19478, 27)

In [6]:
# Get all columns names
raw_data.columns


Index(['city', 'type', 'squareMeters', 'rooms', 'floor', 'floorCount',
       'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount',
       'schoolDistance', 'clinicDistance', 'postOfficeDistance',
       'kindergartenDistance', 'restaurantDistance', 'collegeDistance',
       'pharmacyDistance', 'ownership', 'buildingMaterial', 'condition',
       'hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity',
       'hasStorageRoom', 'price'],
      dtype='object')

In [11]:
# Get sample data from the dataset
raw_data.sample(5)

Unnamed: 0,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
7533,gdansk,tenement,100.65,5.0,1.0,2.0,1910.0,54.381931,18.605442,4.81,...,0.162,condominium,brick,,no,no,no,no,yes,829000
18769,bydgoszcz,,35.0,2.0,2.0,4.0,,53.10945,18.03534,2.7,...,0.142,condominium,,,no,yes,no,no,no,289000
13653,warszawa,blockOfFlats,76.0,4.0,6.0,7.0,1993.0,52.2506,21.0397,3.24,...,0.02,cooperative,,,yes,yes,yes,yes,yes,940000
13865,warszawa,,50.0,2.0,14.0,15.0,2002.0,52.282382,21.065368,7.03,...,0.755,condominium,,,yes,no,yes,yes,no,679000
5890,gdansk,blockOfFlats,60.3,3.0,1.0,4.0,1984.0,54.33883,18.60951,3.14,...,0.249,condominium,concreteSlab,,no,yes,no,no,yes,695000


In [17]:
# Check % of NaN in each column
raw_data.isna().sum() /  100 / len(raw_data)

city                    0.000000
type                    0.002097
squareMeters            0.000000
rooms                   0.000000
floor                   0.001828
floorCount              0.000137
buildYear               0.001711
latitude                0.000000
longitude               0.000000
centreDistance          0.000000
poiCount                0.000000
schoolDistance          0.000007
clinicDistance          0.000045
postOfficeDistance      0.000013
kindergartenDistance    0.000007
restaurantDistance      0.000030
collegeDistance         0.000319
pharmacyDistance        0.000014
ownership               0.000000
buildingMaterial        0.003897
condition               0.007603
hasParkingSpace         0.000000
hasBalcony              0.000000
hasElevator             0.000476
hasSecurity             0.000000
hasStorageRoom          0.000000
price                   0.000000
dtype: float64

## Data preparation

In [None]:
"""
Since a linear regression won't accept variables with nulls, we may need to get rid of nulls before bulding a model.
Replacing NaNs with 0 won't work in this case because it will mess up the calculations. So my solution was to drop rows with NaNs.
We don't have a variable with a lot of missing values, but if there were such variables, I'd consider not choosing it for my model.
"""

In [13]:
# Get only columns we are going to use to build a model
real_estate_df = raw_data[['city', 'type', 'squareMeters', 'rooms', 'buildYear', 'price']]

In [18]:
# Get rid of rows with NaNs
real_estate_df = real_estate_df.dropna()

In [19]:
# Check % of NaN after dropping
real_estate_df.isna().sum() /  100 / len(real_estate_df)

city            0.0
type            0.0
squareMeters    0.0
rooms           0.0
buildYear       0.0
price           0.0
dtype: float64

In [None]:
"""
A categorical variable "type" has been replaced with a binary variable,
and the "city" variable has been replaced with city population nr taken from here: https://www.polskawliczbach.pl/najwieksze_miasta_w_polsce_pod_wzgledem_liczby_ludnosci
"""

In [20]:
# Check how many unique values this column has
real_estate_df['type'].unique()

array(['blockOfFlats', 'apartmentBuilding', 'tenement'], dtype=object)

In [23]:
# Use one-hot encoding
real_estate_df = pd.get_dummies(real_estate_df, columns = ['type'])

In [21]:
# Check unique values of the city variable
real_estate_df['city'].unique()

array(['szczecin', 'gdynia', 'krakow', 'poznan', 'bialystok', 'gdansk',
       'wroclaw', 'radom', 'rzeszow', 'lodz', 'katowice', 'lublin',
       'czestochowa', 'warszawa', 'bydgoszcz'], dtype=object)

In [27]:
# Create a dictionary with cities population
cities_population_dict = {'szczecin':391566, 'gdynia':242874,
       'krakow':803282, 'poznan':541316, 'bialystok':292600 ,
       'wroclaw':674079 , 'radom':197848, 'rzeszow':197181 , 'lodz':658444,
       'katowice':280190, 'lublin':331243, 'gdansk':486345 ,
       'czestochowa':208282 , 'warszawa':1861975, 'bydgoszcz':330038}

In [30]:
# Create a new column, then drop the city column
real_estate_df['city_population'] = real_estate_df['city'].map(cities_population_dict)
real_estate_df = real_estate_df.drop(columns=['city'])

In [31]:
real_estate_df.sample(5)

Unnamed: 0,squareMeters,rooms,buildYear,price,type_apartmentBuilding,type_blockOfFlats,type_tenement,city_population
4599,49.0,2.0,2000.0,465000,0,1,0,541316
9921,36.0,2.0,1970.0,295000,0,1,0,658444
1458,37.89,2.0,1930.0,649000,0,0,1,803282
3201,50.75,3.0,2016.0,1116449,1,0,0,803282
7203,54.8,3.0,1980.0,669000,0,1,0,486345


## Create a model

### Data vizualization

### Feature scaling

### Compute cost

### Gradient descent

### Batch gradient descent

## Test the model