# Challenge Data Analysis

## Import modules

In [1]:
import numpy as np
import pandas as pd

## Files used

In [2]:
filename_in = './data/immoweb_scrapped_2.csv'
filename_out = './data/immoweb_cleaned_2.csv'

## Load data

In [3]:
data = pd.read_csv(filename_in, index_col="id")
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,


## Initial check
Which values are missing?

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53730 entries, 9044081 to 6918766
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   locality             53730 non-null  int64  
 1   type_of_property     53730 non-null  object 
 2   subtype_of_property  53730 non-null  object 
 3   price                50400 non-null  float64
 4   type_of_sale         53730 non-null  object 
 5   nr_of_rooms          50569 non-null  float64
 6   area                 42463 non-null  float64
 7   equiped_kitchen      33156 non-null  object 
 8   furnished            26978 non-null  object 
 9   open_fire            53730 non-null  bool   
 10  terrace              28758 non-null  object 
 11  terrace_area         17376 non-null  float64
 12  garden               14340 non-null  object 
 13  garden_area          8129 non-null   float64
 14  total_land_area      28377 non-null  float64
 15  nr_of_facades        35474 n

In [5]:
sum(data.apply(lambda x: sum(x.isnull().values), axis = 1)==0)

904

## Clean data

### No duplicates

In [6]:
data.index.is_unique

True

### No blank space

In [7]:
# Remove leading and trailing withespaces
def strip_str(x):
    if type(x) is str:
        return x.strip()
    return x

In [8]:
data = data.applymap(strip_str)

In [9]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,


### No errors

##### Price error

In [10]:
data.loc[data['price'] == 1, 'price'] = np.nan

##### Area error

In [11]:
data.loc[data['area'] == 1, 'area'] = np.nan

### No empty values

##### Empty price
Removing all the data without a price


In [12]:
data[data['price'].isna()].shape[0]

3331

In [13]:
data = data.dropna(subset=['price'])

In [14]:
data[data['price'] == data['price'].min()]

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9052002,1000,HOUSE,HOUSE,65.0,FOR_SALE,,,,,False,,,,,,,,


In [15]:
data[data['price'] == data['price'].max()]

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
8982779,7110,HOUSE,OTHER_PROPERTY,35000000.0,FOR_SALE,2.0,,,True,False,,,,,1.0,1.0,,
8963683,1190,HOUSE,HOUSE,35000000.0,FOR_SALE,2.0,113.0,HYPER_EQUIPPED,False,True,,,True,30.0,99.0,2.0,,GOOD
9064050,1850,APARTMENT,SERVICE_FLAT,35000000.0,FOR_SALE,0.0,,,,False,,,,,,4.0,,


##### Empty nb rooms

In [16]:
data = data.dropna(subset=['nr_of_rooms'])

##### Empty area

In [17]:
data = data.dropna(subset=['area'])

##### Equiped kitchen

In [18]:
data['equiped_kitchen'] = data['equiped_kitchen'].replace(np.nan, 'UNK')

##### Furnished

In [19]:
data['furnished'] = data['furnished'].fillna(False)

In [20]:
data['furnished'].value_counts()

False    40917
True      1401
Name: furnished, dtype: int64

##### Terrace


In [21]:
data['terrace'] = data['terrace'].fillna(False)

In [22]:
data['terrace'].value_counts()

True     26302
False    16016
Name: terrace, dtype: int64

##### Terrace area

In [23]:
data[(data['terrace_area'] != data['terrace_area']) & (data['terrace'] == True)].shape[0]

9519

In [24]:
data['terrace_area'] = data['terrace_area'].fillna(-1)

In [25]:
data['terrace_area'].value_counts()

-1.0      25532
 10.0       979
 20.0       959
 15.0       787
 6.0        763
          ...  
 176.0        1
 248.0        1
 500.0        1
 280.0        1
 169.0        1
Name: terrace_area, Length: 211, dtype: int64

##### Garden
Try to determinate the presence on basis of :

- total land area
- terrace_area
- area

In [26]:
# Function that will replace the NaN garden value 
# by a boolean (True/False)
# True if total_land_area > (area + terrace_area)
# otherwise False
def garden_nan_to_bool_value(x):
    # Only if value is NaN
    if x["garden"] != x["garden"]:
        area = x["area"] if x["area"] > 0 else 0
        terrace_area = x["terrace_area"] if x["terrace_area"] > 0 else 0
        return x["total_land_area"] > area + terrace_area
    return x["garden"]

In [27]:
data["garden"] = data.apply(garden_nan_to_bool_value, axis=1)

In [28]:
data['garden'].value_counts()

False    21907
True     20411
Name: garden, dtype: int64

##### Total land area
Try to determinate the garden area on basis of :

- terrace_area
- area

If not possible to determinate it, -1

In [29]:
data['total_land_area'].value_counts()

0.0        2008
150.0       174
100.0       172
1000.0      146
120.0       145
           ... 
2968.0        1
6449.0        1
14260.0       1
2994.0        1
1446.0        1
Name: total_land_area, Length: 3095, dtype: int64

In [30]:
def total_land_area_nan_to_value(x):
    # Only if value is NaN
    if (x["total_land_area"] != x["total_land_area"] or 
        x["total_land_area"] < x["garden_area"] + x["terrace_area"] or
        x["total_land_area"] == 0) :
        area = x["area"] if x["area"] > 0 else 0
        terrace_area = x["terrace_area"] if x["terrace_area"] > 0 else 0
        garden_area = x["garden_area"] if x["garden_area"] > 0 else 0
        return area + terrace_area + garden_area
    return x["total_land_area"]

In [31]:
data["total_land_area"] = data.apply(total_land_area_nan_to_value, axis=1)

##### Garden area
Try to determinate the garden area on basis of :

- total land area
- terrace_area
- area

If not possible to determinate it, -1

In [32]:
# Function that will replace the NaN garden_are value 
# by a int
# if total_land_area > (area + terrace_area)
# otherwise 0
def garden_area_nan_to_value(x):
    # Only if value is NaN
    if x["garden_area"] != x["garden_area"]:
        area = x["area"] if x["area"] > 0 else 0
        terrace_area = x["terrace_area"] if x["terrace_area"] > 0 else 0
        
        if x["total_land_area"] >= area + terrace_area:
            return x["total_land_area"] - area - terrace_area
        else:
            return -1
    return x["garden_area"]

In [33]:
data["garden_area"] = data.apply(garden_area_nan_to_value, axis=1)

In [34]:
data['garden_area'].value_counts()

 0.0        20631
-1.0         3132
 100.0        285
 50.0         239
 200.0        204
            ...  
 29595.0        1
 2371.0         1
 1751.0         1
 2307.0         1
 27444.0        1
Name: garden_area, Length: 2964, dtype: int64

##### Swimming pool

In [35]:
data['swimming_pool'] = data['swimming_pool'].fillna(False)

##### Number of facades

In [36]:
data['nr_of_facades'] = data['nr_of_facades'].fillna(-1)

##### Building condition

In [37]:
data['building_condition'] = data['building_condition'].fillna('UKN')

### Added values

##### Kitchen

In [38]:
#create a new column with condition over 'equiped_kitchen' column
l = ['UNK','NOT_INSTALLED', 'USA_UNINSTALLED']

data['kitchen'] = np.where(data['equiped_kitchen'].isin(l), True, False )

In [39]:
data['kitchen'].value_counts()

False    27896
True     14422
Name: kitchen, dtype: int64

In [40]:
data['equiped_kitchen'].value_counts()

INSTALLED             15451
UNK                   12736
HYPER_EQUIPPED         6235
SEMI_EQUIPPED          3157
USA_HYPER_EQUIPPED     2052
NOT_INSTALLED          1661
USA_INSTALLED           810
USA_SEMI_EQUIPPED       191
USA_UNINSTALLED          25
Name: equiped_kitchen, dtype: int64

##### Region and Province

In [41]:
# Function that will determine the Region
# based on the Locality ZipCode
# 
#1000–1299 -> BXL
#1300–1499 -> WAL
#1500–3999 -> VLA
#4000–4999 -> WAL
#5000–7999 -> WAL
#8000–9999 -> VLA
def get_region_on_zip_code(x):
    # Only if value is NaN
    if (1000 <= x <= 1299):
        return "BXL"
    elif (1300 <= x <= 1499) or (4000 <= x <= 7999):
        return "WAL"
    else:
        return "VLA"

In [42]:
data["region"] = data.locality.apply(get_region_on_zip_code)

In [43]:
data["region"].value_counts()

VLA    26221
WAL    10725
BXL     5372
Name: region, dtype: int64

In [44]:
# function to recognize the province based on the postal code
def province(x):
    prov = []
    for i in x["locality"]:
        if 2000 <= i <= 2999:  # Antwerp
            prov.append("VAN")
        if 3500 <= i <= 3999:  # Limburg
            prov.append("VLI")
        if 4000 <= i <= 4999:  # Liège
            prov.append("WLG")
        if 1300 <= i <= 1499:  # Brabant Wallon
            prov.append("WBR")
        if 5000 <= i <= 5999:  # Namur
            prov.append("WNA") 
        if 6600 <= i <= 6999:  # Luxembourg
            prov.append("WLX")            
        if 1000 <= i <= 1299:  # Brussels
            prov.append("BXL")
        if (1500 <= i <= 1999) | (3000 <= i <= 3499):  # Brabant Flamand
            prov.append("VBR")
        if (6000 <= i <= 6599) | (7000 <= i <= 7999):  # Hainaut
            prov.append("WHT")            
        if 8000 <= i <= 8999:  # Flander Occidentale
            prov.append("VWV") 
        if 9000 <= i <= 9999:  # Flander Orientale
            prov.append("VOV")
    return prov

In [45]:
data["province"] = province(data)

In [46]:
data["province"].value_counts()

VWV    7544
VAN    6502
VOV    5871
BXL    5372
WHT    4665
VBR    4303
VLI    2001
WBR    1716
WNA    1661
WLG    1464
WLX    1219
Name: province, dtype: int64

##### Price meter price
On the build area and the total land area

In [47]:
# Price / Area 
def price_sq_meter(x):
    if x["area"] > 0:
        return x["price"]/x["area"]
    return -1

In [48]:
data["sq_m_price"] = data.apply(price_sq_meter, axis=1)

In [49]:
data["sq_m_price"].value_counts()

2500.000000    264
3000.000000    181
2000.000000    152
1666.666667    115
1500.000000    107
              ... 
2141.085271      1
1997.991968      1
389.408100       1
4077.777778      1
2319.277108      1
Name: sq_m_price, Length: 18360, dtype: int64

In [50]:
# Price / Total land area
def price_sq_meter_land(x):
    if x["total_land_area"] > 0:
        return x["price"]/x["total_land_area"]
    return -1

In [51]:
data["sq_m_land_price"] = data.apply(price_sq_meter_land, axis=1)

In [52]:
data["sq_m_land_price"].value_counts()

2500.000000    161
3000.000000    108
2000.000000     80
1500.000000     70
1666.666667     63
              ... 
439.493050       1
7195.945946      1
482.695811       1
295.169946       1
262.844674       1
Name: sq_m_land_price, Length: 24949, dtype: int64

## Conversion


##### bool -> 0/1

In [53]:
def bool_to_int(x):
    if type(x) is bool:
        if x:
            return 1
        else:
            return 0
    return x

In [54]:
data = data.applymap(bool_to_int)

In [55]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,...,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition,kitchen,region,province,sq_m_price,sq_m_land_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,0,0,...,0.0,103.0,4.0,0,AS_NEW,0,BXL,BXL,2944.444444,2572.815534
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,0,1,...,0.0,1050.0,3.0,0,AS_NEW,0,BXL,BXL,2761.538462,1709.52381
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,0,0,...,-1.0,340.0,2.0,0,JUST_RENOVATED,0,BXL,BXL,5053.191489,11176.470588
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,0,0,...,1200.0,1421.0,3.0,0,AS_NEW,1,WAL,WLG,1385.281385,225.193526
9043036,9600,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,75.0,INSTALLED,0,0,...,0.0,75.0,2.0,0,GOOD,0,VLA,VOV,2600.0,2600.0


##### Round float to 2 decimal

In [56]:
def float_round_2_decimal(x):
    if type(x) is float:
        return round(x, 2)
    return x

In [57]:
data = data.applymap(float_round_2_decimal)

##### Convert text dictionary to numeric
Allows us to have correlation on it

In [58]:
dic_type_of_property = {
        "APARTMENT": 1,
        "HOUSE":     2
    }

data["type_of_property_num"] = data["type_of_property"].apply(lambda x: dic_type_of_property.get(x))

In [59]:
dic_subtype_of_property = {
        "APARTMENT":1,
        "MANSION":2,
        "HOUSE":3,
        "TRIPLEX":4,
        "VILLA":5,
        "FLAT_STUDIO":6,
        "EXCEPTIONAL_PROPERTY":7,
        "LOFT":8,
        "DUPLEX":9,
        "SERVICE_FLAT":10,
        "TOWN_HOUSE":11,
        "FARMHOUSE":12,
        "PENTHOUSE":13,
        "BUNGALOW":14,
        "APARTMENT_BLOCK":15,
        "COUNTRY_COTTAGE":15,
        "MIXED_USE_BUILDING":17,
        "GROUND_FLOOR":17,
        "OTHER_PROPERTY":19,
        "MANOR_HOUSE":20,
        "CHALET":21,
        "KOT":22,
        "CASTLE":23
    }
data["subtype_of_property_num"] = data["subtype_of_property"].apply(lambda x: dic_subtype_of_property.get(x))

In [60]:
dic_type_of_sale = {"FOR_SALE": 1}
data["type_of_sale_num"] = data["type_of_sale"].apply(lambda x: dic_type_of_sale.get(x))

In [61]:
dic_equiped_kitchen = {
        "INSTALLED": 1,
        "USA_HYPER_EQUIPPED": 2,
        "HYPER_EQUIPPED": 3,
        "NOT_INSTALLED": 4,
        "SEMI_EQUIPPED": 5,
        "USA_INSTALLED": 6,
        "USA_SEMI_EQUIPPED": 7,
        "UNK": -1,
        "USA_UNINSTALLED": 8
    }
data["equiped_kitchen_num"] = data["equiped_kitchen"].apply(lambda x: dic_equiped_kitchen.get(x))

In [62]:
dic_building_condition = {
        "AS_NEW": 1,
        "JUST_RENOVATED": 2,
        "GOOD": 3,
        "TO_BE_DONE_UP": 4,
        "TO_RENOVATE": 5,
        "UKN": -1,
        "TO_RESTORE": 6
    }
data["building_condition_num"] = data["building_condition"].apply(lambda x: dic_building_condition.get(x))

In [63]:
dic_region = {
        "BXL": 0,
        "VLA": 1,
        "WAL": 2
    }
data["region_num"] = data["region"].apply(lambda x: dic_region.get(x))

In [64]:
dic_province = {
        "BXL": 1,
        "VAN": 2,
        "VBR": 3,
        "VLI": 4,
        "VOV": 5,
        "VWV": 6,
        "WBR": 7,
        "WHT": 8,
        "WLG": 9,
        "WLX": 10,
        "WNA": 11
    }
data["province_num"] = data["province"].apply(lambda x: dic_province.get(x))

## Fix dtype

In [65]:
data.dtypes

locality                     int64
type_of_property            object
subtype_of_property         object
price                      float64
type_of_sale                object
nr_of_rooms                float64
area                       float64
equiped_kitchen             object
furnished                    int64
open_fire                    int64
terrace                      int64
terrace_area               float64
garden                       int64
garden_area                float64
total_land_area            float64
nr_of_facades              float64
swimming_pool                int64
building_condition          object
kitchen                      int64
region                      object
province                    object
sq_m_price                 float64
sq_m_land_price            float64
type_of_property_num         int64
subtype_of_property_num      int64
type_of_sale_num             int64
equiped_kitchen_num          int64
building_condition_num       int64
region_num          

In [66]:
convert_dict = {'price': int,
                'nr_of_rooms': int,
                'area': int,
                'furnished': int,
                'terrace_area': int,
                'garden_area': int, 
                'nr_of_facades': int,
                'total_land_area': int 
               } 
  
data = data.astype(convert_dict)

In [67]:
data.dtypes

locality                     int64
type_of_property            object
subtype_of_property         object
price                        int64
type_of_sale                object
nr_of_rooms                  int64
area                         int64
equiped_kitchen             object
furnished                    int64
open_fire                    int64
terrace                      int64
terrace_area                 int64
garden                       int64
garden_area                  int64
total_land_area              int64
nr_of_facades                int64
swimming_pool                int64
building_condition          object
kitchen                      int64
region                      object
province                    object
sq_m_price                 float64
sq_m_land_price            float64
type_of_property_num         int64
subtype_of_property_num      int64
type_of_sale_num             int64
equiped_kitchen_num          int64
building_condition_num       int64
region_num          

In [68]:
data.shape

(42318, 30)

In [69]:
data.head(100)

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,...,province,sq_m_price,sq_m_land_price,type_of_property_num,subtype_of_property_num,type_of_sale_num,equiped_kitchen_num,building_condition_num,region_num,province_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9044081,1083,APARTMENT,APARTMENT,265000,FOR_SALE,4,90,INSTALLED,0,0,...,BXL,2944.44,2572.82,1,1,1,1,1,0,1
9043978,1000,APARTMENT,APARTMENT,1795000,FOR_SALE,4,650,USA_HYPER_EQUIPPED,0,1,...,BXL,2761.54,1709.52,1,1,1,2,1,0,1
9044188,1050,HOUSE,MANSION,3800000,FOR_SALE,5,752,HYPER_EQUIPPED,0,0,...,BXL,5053.19,11176.47,2,2,1,3,2,0,1
9041095,4860,HOUSE,HOUSE,320000,FOR_SALE,5,231,NOT_INSTALLED,0,0,...,WLG,1385.28,225.19,2,3,1,4,1,2,9
9043036,9600,APARTMENT,APARTMENT,195000,FOR_SALE,2,75,INSTALLED,0,0,...,VOV,2600.00,2600.00,1,1,1,1,3,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9043293,8020,HOUSE,HOUSE,495000,FOR_SALE,3,217,HYPER_EQUIPPED,0,1,...,VWV,2281.11,473.68,2,3,1,3,1,1,6
9007296,5060,APARTMENT,DUPLEX,318000,FOR_SALE,3,118,HYPER_EQUIPPED,0,0,...,WNA,2694.92,2271.43,1,9,1,3,3,2,11
9029109,4140,HOUSE,BUNGALOW,349000,FOR_SALE,3,180,SEMI_EQUIPPED,0,0,...,WLG,1938.89,208.48,2,14,1,5,1,2,9
9029927,5580,HOUSE,HOUSE,599000,FOR_SALE,4,320,INSTALLED,0,1,...,WNA,1871.88,127.58,2,3,1,1,1,2,11


### Delete unrelevant rows

##### Area unknown
Area must be > 0

In [70]:
data = data[data["area"] > 0]

##### Apartement block
They are whole building and not just a real estate property 

In [71]:
data = data[data["subtype_of_property"] != 'APARTMENT_BLOCK']

##### Price / SqM unknown
Price / Square Meter must be > 0

In [72]:
data = data[data["sq_m_price"] > 0]

##### Price /SqM_land > Price / SqM_area 

In [73]:
data[data["sq_m_land_price"] > data["sq_m_price"]].shape

(3545, 30)

In [74]:
data = data[data["sq_m_land_price"] <= data["sq_m_price"]]

##### Number of rooms must occurs at least 5 times

In [75]:
data = data[data["nr_of_rooms"].duplicated(keep=False).groupby(data["nr_of_rooms"]).transform('sum').gt(4)]
data[["nr_of_rooms"]].value_counts()


nr_of_rooms
2              12722
3              12111
4               4842
1               4248
5               1931
0                730
6                714
7                241
8                130
9                 54
10                42
12                23
11                20
15                 6
18                 5
dtype: int64

##### Suspect area vs Number of rooms
A room must at least have 9m2

In [76]:
data = data[data["nr_of_rooms"]*9 <= data["area"]]

### Delete unrelevant columns

##### Type of sale
Always FOR_SALE

In [77]:
data.drop(["type_of_sale"], axis=1, inplace=True)
data.drop(["type_of_sale_num"], axis=1, inplace=True)

##### Furnished

In [78]:
data.drop(["furnished"], axis=1, inplace=True)

### Final data check

In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37813 entries, 9044081 to 8858590
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   locality                 37813 non-null  int64  
 1   type_of_property         37813 non-null  object 
 2   subtype_of_property      37813 non-null  object 
 3   price                    37813 non-null  int64  
 4   nr_of_rooms              37813 non-null  int64  
 5   area                     37813 non-null  int64  
 6   equiped_kitchen          37813 non-null  object 
 7   open_fire                37813 non-null  int64  
 8   terrace                  37813 non-null  int64  
 9   terrace_area             37813 non-null  int64  
 10  garden                   37813 non-null  int64  
 11  garden_area              37813 non-null  int64  
 12  total_land_area          37813 non-null  int64  
 13  nr_of_facades            37813 non-null  int64  
 14  swimming_pool 

### Save data cleaned

In [80]:
data.to_csv(filename_out)