In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from moscow_housing.display_data import import_data

# Apply matplotlib's 'ggplot' style sheet to every figure created after this point.
plt.style.use('ggplot')
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In this notebook we will do EDA (exploratory data analysis). We will look at the dataset's standard statistics, analyse individual features, and look for relations or trends across multiple features. As a conclusion, we will suggest features that can be added as a result of the EDA.

In [19]:
# Load the training and test splits via the project helper.
data_train, data_test = import_data()

# Some analyses need every listing at once, so stack both splits into a
# single frame. No `ignore_index` is passed, so each split keeps its own
# row labels in the combined frame.
all_data = pd.concat(
    [data_train, data_test],
)
all_data

Unnamed: 0,id,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,...,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating
0,0,3.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,...,к2.5/2,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,
1,1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
2,2,3.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,...,38,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,
3,3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
4,4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.90,...,1к3,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,33217,3.0,,106.0,19.9,56.7,16.0,3.0,,3.30,...,10А,2020.0,2.0,20.0,0.0,1.0,1.0,0.0,,0.0
9933,33218,,,82.0,,,3.0,3.0,,,...,к1,2021.0,2.0,20.0,0.0,1.0,1.0,1.0,,
9934,33219,,,49.3,,,15.0,1.0,,0.00,...,70к5,2016.0,2.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0
9935,33220,,,38.8,10.5,15.1,14.0,1.0,,3.30,...,1,2019.0,,14.0,1.0,1.0,1.0,0.0,1.0,


In [20]:
# Basic overview of both splits: row counts, frame shapes and column names.
# NOTE(review): the "X" shapes are the shapes of the full frames (the training
# frame still contains the `price` target), and "Test y Shape" is simply the
# test row count - the test column list printed below has no `price` column.
print('Number of Training Examples = {}'.format(len(data_train)))
print('Number of Test Examples = {}\n'.format(len(data_test)))
print('Training X Shape = {}'.format(data_train.shape))
print('Training y Shape = {}\n'.format(len(data_train['price'])))
print('Test X Shape = {}'.format(data_test.shape))
print('Test y Shape = {}\n'.format(len(data_test)))
print('Train columns \n', data_train.columns.tolist())
print('Test columns \n', data_test.columns.tolist())

# Per-column summary of the training set: dtype, number of nulls, number of
# distinct non-null values, and the frequency of each value.
for column in data_train.columns:
    column_values = data_train[column]
    # Computed once and reused for both the unique count and the frequency table.
    column_counts = column_values.value_counts()
    print(
        '\n - type of column: ', column_values.dtype,
        '\n - number of null values: ', column_values.isnull().sum(),
        '\n - number of unique inputs: ', column_counts.count(),
        '\n', column_counts,
    )


Number of Training Examples = 23285
Number of Test Examples = 9937

Training X Shape = (23285, 34)
Training y Shape = 23285

Test X Shape = (9937, 33)
Test y Shape = 9937

Train columns 
 ['id', 'seller', 'price', 'area_total', 'area_kitchen', 'area_living', 'floor', 'rooms', 'layout', 'ceiling', 'bathrooms_shared', 'bathrooms_private', 'windows_court', 'windows_street', 'balconies', 'loggias', 'condition', 'phones', 'building_id', 'new', 'latitude', 'longitude', 'district', 'street', 'address', 'constructed', 'material', 'stories', 'elevator_without', 'elevator_passenger', 'elevator_service', 'parking', 'garbage_chute', 'heating']
Test columns 
 ['id', 'seller', 'area_total', 'area_kitchen', 'area_living', 'floor', 'rooms', 'layout', 'ceiling', 'bathrooms_shared', 'bathrooms_private', 'windows_court', 'windows_street', 'balconies', 'loggias', 'condition', 'phones', 'building_id', 'new', 'latitude', 'longitude', 'district', 'street', 'address', 'constructed', 'material', 'stories', 'el

As we can see, the training and test sets have the same columns, except for `price`, which is only present in the training set. Let's look a little bit closer at these features.
### **Features description**
* `id` - Unique ID for apartment
    - type of column:  int64
    - number of null values:  0
    - number of unique inputs:  23285
* `seller` - The type of apartment seller
 - type of column:  float64
 - number of null values:  8830
 - number of unique inputs:  4
* `price` - The listed price of the apartment (TARGET: only available in train)
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  9504
* `area_total` - Total area of the apartment
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  3249
* `area_kitchen` - Total kitchen area in the apartment
 - type of column:  float64
 - number of null values:  4721
 - number of unique inputs:  385
* `area_living` - Total living space area in the apartment
 - type of column:  float64
 - number of null values:  3882
 - number of unique inputs:  1041
* `floor`- Primary building floor of the apartment
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  84
* `rooms` - Number of rooms in the apartment
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  6
* `layout` - Overall apartment layout
 - type of column:  float64
 - number of null values:  17642
 - number of unique inputs:  3
* `ceiling` - Ceiling height in the apartment
 - type of column:  float64
 - number of null values:  11093
 - number of unique inputs:  121
* `bathrooms_shared` - Number of shared bathrooms
 - type of column:  float64
 - number of null values:  3872
 - number of unique inputs:  5
* `bathrooms_private` - Number of private bathrooms
 - type of column:  float64
 - number of null values:  3872
 - number of unique inputs:  5
* `windows_court` - Whether the apartment has windows facing a courtyard
 - type of column:  float64
 - number of null values:  8072
 - number of unique inputs:  2
* `windows_street` - Whether the apartment has windows facing a street
 - type of column:  float64
 - number of null values:  8072
 - number of unique inputs:  2
* `balconies` - Number of balconies in the apartment
 - type of column:  float64
 - number of null values:  10520
 - number of unique inputs:  5
* `loggias` - Number of loggias in the apartment (a loggia is a balcony-like space recessed into the building facade)
 - type of column:  float64
 - number of null values:  10520
 - number of unique inputs:  5
* `condition` - Current condition of the apartment
 - type of column:  float64
 - number of null values:  9810
 - number of unique inputs:  4
* `phones` - Number of phone numbers associated with the advert
 - type of column:  float64
 - number of null values:  80
 - number of unique inputs:  3
* `building_id` - ID used to map apartments to buildings
 - type of column:  int64
 - number of null values:  0
 - number of unique inputs:  6791
* `new` - Whether it is an old or new building
 - type of column:  float64
 - number of null values:  264
 - number of unique inputs:  2
* `latitude` - Latitude coordinate of building
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  6208
* `longitude` - Longitude coordinate of building
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  6159
* `district` - Administrative district within Moscow
 - type of column:  float64
 - number of null values:  130
 - number of unique inputs:  12
* `street` - Building street name
 - type of column:  object
 - number of null values:  0
 - number of unique inputs:  1682
* `address` - Building address (within street)
 - type of column:  object
 - number of null values:  0
 - number of unique inputs:  1851
* `constructed` - Year when the building was constructed
 - type of column:  float64
 - number of null values:  794
 - number of unique inputs:  128
* `material` - Primary building material used in building
 - type of column:  float64
 - number of null values:  3972
 - number of unique inputs:  7
* `stories` - Total number of floors in the building
 - type of column:  float64
 - number of null values:  0
 - number of unique inputs:  62
* `elevator_without` - Whether the building has apartments without elevator access
 - type of column:  float64
 - number of null values:  357
 - number of unique inputs:  2
* `elevator_passenger` - Whether the building has apartments with passenger elevator access
 - type of column:  float64
 - number of null values:  357
 - number of unique inputs:  2
* `elevator_service` - Whether the building has apartments with service elevator access
 - type of column:  float64
 - number of null values:  357
 - number of unique inputs:  2
* `parking` - Parking options for building
 - type of column:  float64
 - number of null values:  6788
 - number of unique inputs:  3
* `garbage_chute` - Whether the building has a garbage chute system
 - type of column:  float64
 - number of null values:  8811
 - number of unique inputs:  2
* `heating` - Primary heating system used in the building
 - type of column:  float64
 - number of null values:  8161
 - number of unique inputs:  4


**TODO:** How should we deal with binary values?