In [376]:
import json
import math as math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline


np.random.seed(123)
sns.set_style('darkgrid')
pd.set_option('display.max_colwidth', None)

In [377]:
def describe_column(meta):
    """
    Utility function for describing a dataset column (see below for usage)
    """
    def f(x):
        d = pd.Series(name=x.name, dtype=object)
        m = next(m for m in meta if m['name'] == x.name)
        d['Type'] = m['type']
        d['#NaN'] = x.isna().sum()
        d['Description'] = m['desc']
        if m['type'] == 'categorical':
            counts = x.dropna().map(dict(enumerate(m['cats']))).value_counts().sort_index()
            d['Statistics'] = ', '.join(f'{c}({n})' for c, n in counts.items())
        elif m['type'] == 'real' or m['type'] == 'integer':
            stats = x.dropna().agg(['mean', 'std', 'min', 'max'])
            d['Statistics'] = ', '.join(f'{s}={v :.1f}' for s, v in stats.items())
        elif m['type'] == 'boolean':
            counts = x.dropna().astype(bool).value_counts().sort_index()
            d['Statistics'] = ', '.join(f'{c}({n})' for c, n in counts.items())
        else:
            d['Statistics'] = f'#unique={x.nunique()}'
        return d
    return f

def describe_data(data, meta):
    desc = data.apply(describe_column(meta)).T
    desc = desc.style.set_properties(**{'text-align': 'left'})
    desc = desc.set_table_styles([ dict(selector='th', props=[('text-align', 'left')])])
    return desc

In [378]:
apartments = pd.read_csv('apartments_train.csv')
print(f'Loaded {len(apartments)} apartments')
with open('apartments_meta.json') as f:
    apartments_meta = json.load(f)
describe_data(apartments, apartments_meta)

Loaded 23285 apartments


Unnamed: 0,Type,#NaN,Description,Statistics
id,integer_id,0,Unique ID for apartment,#unique=23285
seller,categorical,8830,The type of apartment seller,"Agents(1895), Company(4768), Developer(6185), Owner(1607)"
price,real,0,The listed price of the apartment (TARGET),"mean=23556173.5, std=52643927.8, min=900000.0, max=2600000000.0"
area_total,real,0,Total area of the apartment,"mean=74.5, std=58.7, min=9.3, max=2181.0"
area_kitchen,real,4721,Total kitchen area in the apartment,"mean=12.6, std=6.6, min=1.0, max=100.0"
area_living,real,3882,Total living space area in the apartment,"mean=38.7, std=31.4, min=0.0, max=900.0"
floor,integer,0,Primary building floor of the apartment,"mean=9.0, std=8.3, min=1.0, max=95.0"
rooms,integer,0,Number of rooms in the apartment,"mean=2.2, std=1.1, min=1.0, max=6.0"
layout,categorical,17642,Overal apartment layout,"Adjacent(241), Adjacent_isolated(465), Isolated(4937)"
ceiling,real,11093,Ceiling height in the apartment,"mean=3.3, std=10.8, min=0.0, max=340.0"


In [379]:
buildings = pd.read_csv('buildings_train.csv')
print(f'Loaded {len(buildings)} buildings')
with open('buildings_meta.json') as f:
    buildings_meta = json.load(f)
buildings.head()
describe_data(buildings, buildings_meta)

Loaded 6791 buildings


Unnamed: 0,Type,#NaN,Description,Statistics
id,integer_id,0,Unique ID of building for joining with apartments,#unique=6791
new,boolean,227,Whether it is an old or new building,"False(6141), True(423)"
latitude,real,0,Latitude coordinate of building,"mean=55.7, std=0.1, min=55.2, max=56.0"
longitude,real,0,Longitude coordinate of building,"mean=37.6, std=0.2, min=36.9, max=38.0"
district,categorical,1,Administriative district within Moscow,"Central(637), East(956), North(593), North-East(630), North-West(553), Novomoskovsk(344), South(758), South-East(672), South-West(900), Troitsk(146), West(535), Zelenograd(66)"
street,string,0,Bulding street name,#unique=1682
address,string,0,Building address (within street),#unique=1851
constructed,integer,479,Year when the building was constructed,"mean=1985.8, std=24.3, min=1855.0, max=2023.0"
material,categorical,1165,Primary building material used in building,"Block(679), Bricks(1226), Monolith(1312), Monolithic_brick(19), Panel(2384), Stalin_project(2), Wood(4)"
stories,integer,0,Total number of floors in the building,"mean=13.0, std=7.0, min=1.0, max=95.0"


In [380]:
print(f'All apartments have an associated building: {apartments.building_id.isin(buildings.id).all()}')
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)
data.head()

All apartments have an associated building: True


Unnamed: 0,id,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,...,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating
0,0,3.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,...,к2.5/2,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,
1,1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
2,2,3.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,...,38,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,
3,3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
4,4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.9,...,1к3,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0


In [381]:
def root_mean_squared_log_error(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all()
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

Now we want to find which attributes correlates the most with the pricing, just to check the data.

In [382]:
print(data.corr(method = 'pearson')['price'])

id                   -0.027967
seller                0.101015
price                 1.000000
area_total            0.814829
area_kitchen          0.364758
area_living           0.707580
floor                 0.132206
rooms                 0.411317
layout                0.130822
ceiling               0.031640
bathrooms_shared      0.332421
bathrooms_private     0.133364
windows_court        -0.005345
windows_street        0.180263
balconies             0.102928
loggias               0.024877
condition             0.109338
phones                0.042738
building_id           0.052159
new                  -0.089292
latitude              0.054341
longitude            -0.034022
district             -0.182404
constructed           0.047633
material             -0.100614
stories               0.086448
elevator_without      0.162024
elevator_passenger    0.021846
elevator_service      0.024681
parking              -0.199476
garbage_chute         0.029171
heating               0.087359
Name: pr

We can now see that total area is what affects the price the most, as this is the value which correleates the most with price.
Another interesting thing is thah longitude and latitude doesnt corelate to the price, but what about if we cahnge it into something a little bit easier to understand. For example radius

In [383]:
data_with_radius = data
data_with_radius['radius'] = np.sqrt((data_with_radius['latitude']-55.75)**2 + (data_with_radius['longitude']-37.55)**2)


print(data_with_radius.corr(method='pearson')['price'])
data_with_radius.head()

id                   -0.027967
seller                0.101015
price                 1.000000
area_total            0.814829
area_kitchen          0.364758
area_living           0.707580
floor                 0.132206
rooms                 0.411317
layout                0.130822
ceiling               0.031640
bathrooms_shared      0.332421
bathrooms_private     0.133364
windows_court        -0.005345
windows_street        0.180263
balconies             0.102928
loggias               0.024877
condition             0.109338
phones                0.042738
building_id           0.052159
new                  -0.089292
latitude              0.054341
longitude            -0.034022
district             -0.182404
constructed           0.047633
material             -0.100614
stories               0.086448
elevator_without      0.162024
elevator_passenger    0.021846
elevator_service      0.024681
parking              -0.199476
garbage_chute         0.029171
heating               0.087359
radius  

Unnamed: 0,id,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,...,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating,radius
0,0,3.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,...,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,,0.218159
1,1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,...,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,0.161215
2,2,3.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,...,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,,0.093374
3,3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,...,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,0.161215
4,4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.9,...,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0,0.187254


After centering the radius, the radius achieved a good correleation. I centered it manually by looking at the map of moscov apartments.

Now lets start looking at the log of the price instead

In [384]:
data_with_radius['price'] = np.log(data_with_radius['price'])
print(data_with_radius.corr(method='pearson')['price'])

id                   -0.043946
seller                0.182874
price                 1.000000
area_total            0.782565
area_kitchen          0.517461
area_living           0.749629
floor                 0.212198
rooms                 0.678219
layout                0.178670
ceiling               0.051916
bathrooms_shared      0.396012
bathrooms_private     0.168706
windows_court        -0.009696
windows_street        0.289950
balconies             0.094621
loggias               0.104360
condition             0.173769
phones                0.077263
building_id           0.054510
new                  -0.072046
latitude              0.162631
longitude            -0.067534
district             -0.276415
constructed           0.106131
material             -0.235145
stories               0.233266
elevator_without      0.224828
elevator_passenger    0.095696
elevator_service      0.115201
parking              -0.392306
garbage_chute         0.004026
heating               0.130949
radius  

Now we can see that the radius is even more important, as we use the log of the price. This shows that the radius is more useful when we make the target data more similar.

In [None]:
import sklearn.model_selection as model_selection

data_train, data_valid = model_selection.train_test_split(data_with_radius, test_size=0.33,stratify=np.log(data_with_radius.price).round())
fig, (ax1, ax2) = plt.subplots(figsize=(12, 4), ncols=2, dpi=100)
print(f'Split dataset into {len(data_train)} training samples and {len(data_valid)} validation samples')

sns.histplot(np.log10(data_train.price).rename('log10(price)'), ax=ax1);
sns.histplot(np.log10(data_valid.price).rename('log10(price)'), ax=ax2);
ax1.set_title('Training set log prices'); ax2.set_title('Validation set log prices');

lets train the dataset on the radius, and see how we perform.

In [387]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train = np.array(data_train['radius']).reshape(-1,1)
Y_train = np.array(data_train['price'])

X_valid = np.array(data_valid['radius']).reshape(-1,1)
Y_valid = np.array(data_valid['price'])


reg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
reg.fit(X_train, Y_train)

prediction = reg.predict(X_train)
prediction = 10**prediction
Y_train = 10**Y_train
print(f'Train RMSLE: {root_mean_squared_log_error(y_true=Y_train, y_pred=prediction) :.4f}')

prediction_2 = reg.predict(X_valid)
prediction_2 = 10**prediction_2
Y_valid = 10**Y_valid
print(f'Valid RMSLE: {root_mean_squared_log_error(y_true=Y_valid, y_pred=prediction_2) :.4f}')




Train RMSLE: 1.6528
Valid RMSLE: 1.6549
