# Challenge Data Analysis

### Import modules

In [1]:
import numpy as np
import pandas as pd

### Files used

In [2]:
filename_in = './data/immoweb_scrapped.csv'
filename_out = './data/immoweb_cleaned.csv'

### Load data

In [3]:
data = pd.read_csv(filename_in)
data.head()

Unnamed: 0,id,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
0,9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
1,9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
2,9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
3,9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
4,9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,


### Clean data

#### No duplicates

In [4]:
data = data.set_index("id")
data.index.is_unique

True

In [5]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,


#### No blank space

In [6]:
# Remove leading and trailing withespaces
def strip_str(x):
    if type(x) is str:
        return x.strip()
    return x

In [7]:
data.applymap(strip_str)

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8757737,4400,HOUSE,HOUSE,289500.0,FOR_SALE,4.0,190.0,NOT_INSTALLED,,False,True,,True,,575.0,4.0,,
8898104,2500,APARTMENT,GROUND_FLOOR,359000.0,FOR_SALE,3.0,130.0,HYPER_EQUIPPED,,False,True,13.0,True,95.0,,2.0,,AS_NEW
8967132,3800,APARTMENT,APARTMENT,365000.0,FOR_SALE,2.0,146.0,SEMI_EQUIPPED,,False,,,,,,4.0,,GOOD
8931948,1200,APARTMENT,PENTHOUSE,754500.0,FOR_SALE,2.0,159.0,USA_HYPER_EQUIPPED,False,False,True,76.0,,,,2.0,False,AS_NEW


In [8]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9042175,1160,APARTMENT_GROUP,APARTMENT_GROUP,,FOR_SALE,,,,,False,,,,,,,,


#### No errors

#### No empty values

In [9]:
#removing all the rows without a price
data = data.dropna(subset=['price'])

In [10]:
def area_adj(x):
    if x['area'] != x['area']:
        if x['total_land_area'] > x['garden_area']:
            return x['total_land_area'] - x['garden_area']
        else:
            return -1
    else:
        return x['area']

In [11]:
data['area'] = data.apply(area_adj, axis= 1)

In [14]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,,,,4.0,,AS_NEW
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,,,,3.0,,AS_NEW
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,,340.0,2.0,,JUST_RENOVATED
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW
9043036,9600,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,75.0,INSTALLED,,False,,,,,,2.0,,GOOD


In [15]:
data['area'].isna().sum()

0

In [16]:
data['terrace'] = data['terrace'].fillna(False)

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9054 entries, 9044081 to 8471905
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   locality             9054 non-null   int64  
 1   type_of_property     9054 non-null   object 
 2   subtype_of_property  9054 non-null   object 
 3   price                9054 non-null   float64
 4   type_of_sale         9054 non-null   object 
 5   nr_of_rooms          9054 non-null   float64
 6   area                 9054 non-null   float64
 7   equiped_kitchen      7759 non-null   object 
 8   furnished            5115 non-null   object 
 9   open_fire            9054 non-null   bool   
 10  terrace              9054 non-null   bool   
 11  terrace_area         4365 non-null   float64
 12  garden               3004 non-null   object 
 13  garden_area          2083 non-null   float64
 14  total_land_area      4520 non-null   float64
 15  nr_of_facades        6963 non

In [18]:
data['terrace_area'] = data['terrace_area'].fillna(-1)

In [19]:
# Search property where has garden = NAN
# Using the fact that: np.nan != np.nan
data[data["garden"]!=data["garden"]].shape[0]

6050

In [20]:
# Function that will replace the NaN garden value 
# by a boolean (True/False)
# True if total_land_area > (area + terrace_area)
# otherwise False
def garden_nan_to_bool_value(x):
    # Only if value is NaN
    if x["garden"] != x["garden"]:
        if x["total_land_area"] > x["area"] + x["terrace_area"]:
            return True
        else:
            return False
    return x["garden"]

In [21]:
data["garden"] = data.apply(garden_nan_to_bool_value, axis=1)

In [22]:
# Search property where has garden = NAN
# Using the fact that: np.nan != np.nan
data[data["garden"]!=data["garden"]].shape[0]

0

In [23]:
# Search property where has garden_area = NAN
# Using the fact that: np.nan != np.nan
data[(data["garden"]==True) & (data["garden_area"]!=data["garden_area"])].shape[0]

2187

In [24]:
# Function that will replace the NaN garden_are value 
# by a int
# if total_land_area > (area + terrace_area)
# otherwise 0
def garden_area_nan_to_value(x):
    # Only if value is NaN
    if x["garden_area"] != x["garden_area"]:
        if x["total_land_area"] > x["area"] + x["terrace_area"]:
            return x["total_land_area"] - x["area"] - x["terrace_area"]
        else:
            return 0
    return x["garden_area"]

In [25]:
data["garden_area"] = data.apply(garden_area_nan_to_value, axis=1)

In [26]:
data[(data["garden"]==True) & (data["garden_area"]!=data["garden_area"])].shape[0]

0

In [33]:
data['swimming_pool'] = data['swimming_pool'].fillna(False)
data['nr_of_facades'] = data['nr_of_facades'].fillna(-1)
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition,region,kitchen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,False,0.0,,4.0,False,AS_NEW,BXL,True
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,False,0.0,,3.0,False,AS_NEW,BXL,True
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,0.0,340.0,2.0,False,JUST_RENOVATED,BXL,True
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW,WAL,True
9043036,9600,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,75.0,INSTALLED,,False,False,-1.0,False,0.0,,2.0,False,GOOD,VLA,True


### Adding value

In [30]:
#replace all `nan` values by zero (0) in the column
data['equiped_kitchen'] = data['equiped_kitchen'].replace(np.nan, 0)

#create a new column with condition over 'equiped_kitchen' column
data['kitchen'] = np.where(data['equiped_kitchen'] != 0, True, False)

In [27]:
# Function that will determine the Region
# based on the Locality ZipCode
# 
#1000–1299 -> BXL-Capitale
#1300–1499 -> P Brabrant Wallon -> Wallonie
#1500–1999 -> P Brabant Flamand -> Flandre
#2000–2999 -> P Anvers -> Flandre
#3000–3499 -> P Brabant Flamand -> Flandre
#3500–3999 -> P Limbourg -> Flandre
#4000–4999 -> P Liège -> Wallonie
#5000–5999 -> P Namur -> Wallonie
#6000–6599 -> P Hainaut -> Wallonie
#6600–6999 -> P Luxembourg -> Wallonie
#7000–7999 -> P Hainaut -> Wallonie
#8000–8999 -> P Fl Occidentale -> Flandre
#9000–9999 -> P Fl Orientale -> Flandre
def get_region_on_zip_code(x):
    # Only if value is NaN
    if (1000 <= x <= 1299):
        return "BXL"
    elif (1300 <= x <= 1499) or (4000 <= x <= 7999):
        return "WAL"
    else:
        return "VLA"

In [28]:
data["region"] = data.locality.apply(get_region_on_zip_code)

In [31]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition,region,kitchen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,True,13.0,False,0.0,,4.0,,AS_NEW,BXL,True
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,True,400.0,False,0.0,,3.0,,AS_NEW,BXL,True
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,True,40.0,True,0.0,340.0,2.0,,JUST_RENOVATED,BXL,True
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,True,30.0,True,1200.0,1421.0,3.0,False,AS_NEW,WAL,True
9043036,9600,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,75.0,INSTALLED,,False,False,-1.0,False,0.0,,2.0,,GOOD,VLA,True


In [41]:
# function to recognize the province based on the postal code
def province(x):
    prov = []
    for i in x["locality"]:
        if 2000 <= i <= 2999:  # Antwerp
            prov.append("VAN")
        if 3500 <= i <= 3999:  # Limburg
            prov.append("VLI")
        if 4000 <= i <= 4999:  # Liège
            prov.append("WLG")
        if 1300 <= i <= 1499:  # Brabant Wallon
            prov.append("WBR")
        if 5000 <= i <= 5999:  # Namur
            prov.append("WNA") 
        if 6600 <= i <= 6999:  # Luxembourg
            prov.append("WLX")            
        if 1000 <= i <= 1299:  # Brussels
            prov.append("BXL")
        if (1500 <= i <= 1999) | (3000 <= i <= 3499):  # Brabant Flamand
            prov.append("VBR")
        if (6000 <= i <= 6599) | (7000 <= i <= 7999):  # Hainaut
            prov.append("WHT")            
        if 8000 <= i <= 8999:  # Flander Occidentale
            prov.append("VWV") 
        if 9000 <= i <= 9999:  # Flander Orientale
            prov.append("VOV")
    return prov

In [42]:
data["Province"] = province(data)

In [43]:
data.head()

Unnamed: 0_level_0,locality,type_of_property,subtype_of_property,price,type_of_sale,nr_of_rooms,area,equiped_kitchen,furnished,open_fire,...,terrace_area,garden,garden_area,total_land_area,nr_of_facades,swimming_pool,building_condition,region,kitchen,Province
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9044081,1083,APARTMENT,APARTMENT,265000.0,FOR_SALE,4.0,90.0,INSTALLED,False,False,...,13.0,False,0.0,,4.0,False,AS_NEW,BXL,True,BXL
9043978,1000,APARTMENT,APARTMENT,1795000.0,FOR_SALE,4.0,650.0,USA_HYPER_EQUIPPED,False,True,...,400.0,False,0.0,,3.0,False,AS_NEW,BXL,True,BXL
9044188,1050,HOUSE,MANSION,3800000.0,FOR_SALE,5.0,752.0,HYPER_EQUIPPED,False,False,...,40.0,True,0.0,340.0,2.0,False,JUST_RENOVATED,BXL,True,BXL
9041095,4860,HOUSE,HOUSE,320000.0,FOR_SALE,5.0,231.0,NOT_INSTALLED,False,False,...,30.0,True,1200.0,1421.0,3.0,False,AS_NEW,WAL,True,WLG
9043036,9600,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,75.0,INSTALLED,,False,...,-1.0,False,0.0,,2.0,False,GOOD,VLA,True,VOV


#### Convert dtype

In [44]:
data.dtypes

locality                 int64
type_of_property        object
subtype_of_property     object
price                  float64
type_of_sale            object
nr_of_rooms            float64
area                   float64
equiped_kitchen         object
furnished               object
open_fire                 bool
terrace                   bool
terrace_area           float64
garden                    bool
garden_area            float64
total_land_area        float64
nr_of_facades          float64
swimming_pool             bool
building_condition      object
region                  object
kitchen                   bool
Province                object
dtype: object

In [45]:
convert_dict = {'price': int,
                'nr_of_rooms': int,
                'furnished': bool,
                'terrace': bool,
                'garden': bool, 
                'nr_of_facades': int,
                'swimming_pool': bool 
               } 
  
data = data.astype(convert_dict) 

In [46]:
data.dtypes

locality                 int64
type_of_property        object
subtype_of_property     object
price                    int64
type_of_sale            object
nr_of_rooms              int64
area                   float64
equiped_kitchen         object
furnished                 bool
open_fire                 bool
terrace                   bool
terrace_area           float64
garden                    bool
garden_area            float64
total_land_area        float64
nr_of_facades            int64
swimming_pool             bool
building_condition      object
region                  object
kitchen                   bool
Province                object
dtype: object

### Save data cleaned

In [47]:
data.to_csv(filename_out)