In [10]:
import pandas as pd
import numpy as np

In [11]:
# Location of the raw listings export, relative to the notebook's working directory.
RAW_DATASET_CSV = 'data/raw_dataset_dropPrice_str.csv'
df = pd.read_csv(RAW_DATASET_CSV)

# Data exploration

### Remove duplicated IDs

In [12]:
# Occurrences per Immoweb ID; a count above 1 means the ID appears more than once.
immoweb_id_counts = df['Immoweb ID'].value_counts()
immoweb_id_counts

9691876    4
9662650    3
9597143    3
8237073    3
9687272    3
          ..
9623733    1
9245768    1
9606238    1
9695276    1
7921699    1
Name: Immoweb ID, Length: 13893, dtype: int64

In [13]:
# Keep only the last row seen for every Immoweb ID, then re-count to verify.
df = df.drop_duplicates(subset='Immoweb ID', keep='last')
df['Immoweb ID'].value_counts()

9729720    1
9678715    1
9697071    1
9557609    1
9678634    1
          ..
9518030    1
9622950    1
9478102    1
9620790    1
7921699    1
Name: Immoweb ID, Length: 13893, dtype: int64

### Remove the APARTMENT_BLOCK sub-type

In [14]:
# Boolean mask selecting every row whose sub-type is not APARTMENT_BLOCK.
keep_rows = df['property sub-type'] != "APARTMENT_BLOCK"
df = df[keep_rows]
df['property sub-type'].value_counts()

VILLA                   2468
GROUND_FLOOR            1606
DUPLEX                  1580
MIXED_USE_BUILDING      1240
PENTHOUSE               1170
FLAT_STUDIO              873
EXCEPTIONAL_PROPERTY     566
SERVICE_FLAT             500
MANSION                  479
TOWN_HOUSE               383
COUNTRY_COTTAGE          292
LOFT                     271
BUNGALOW                 212
FARMHOUSE                171
TRIPLEX                   99
KOT                       95
CHALET                    94
MANOR_HOUSE               62
CASTLE                    50
Name: property sub-type, dtype: int64

# Data Cleaning

### Building condition

In [15]:
# Only the last expression of a cell is rendered, so the counts are shown
# explicitly with display(); otherwise their result is silently discarded.
display(df['Building condition'].value_counts())
df['Building condition'].unique()

array(['Good', nan, 'As new', 'To renovate', 'To be done up',
       'Just renovated', 'To restore'], dtype=object)

In [16]:
# Ordinal codes for the building condition, from 'To restore' (1) up to 'As new' (6).
building_condition_map = {'As new': 6, 'Just renovated': 5, 'Good': 4, 'To be done up': 3, 'To renovate':2, 'To restore':1}

# Recode ONLY the 'Building condition' column. Running the lookup over the
# whole frame would also rewrite any other column that happens to contain one
# of these labels, and costs a Python call for every cell of the frame.
# Values that are not keys of the map (including NaN and already-encoded
# numbers) pass through unchanged, so re-running the cell is harmless.
df = df.assign(**{
    'Building condition': df['Building condition'].map(
        lambda s: building_condition_map.get(s, s)
    )
})

# Rows whose building condition is still missing after the recode.
df[df['Building condition'].isnull()]


Unnamed: 0.1,Unnamed: 0,Immoweb ID,Property type,property sub-type,Price,Post code,Building condition,Kitchen type,Bedrooms,Furnished,Terrace surface,Tenement building,Number of frontages,Swimming pool,How many fireplaces?,Garden,Garden orientation,Garden surface,Terrace,Surface of the plot
6,6,9727340,APARTMENT,PENTHOUSE,1450000.0,1180,,,,,,No,,,,,,,,
7,7,9727352,APARTMENT,PENTHOUSE,1900000.0,1060,,,,,,No,,,,,,,,
9,9,9725517,APARTMENT,PENTHOUSE,458000.0,1180,,,,,,No,,,,,,,,
10,10,9724579,APARTMENT,PENTHOUSE,189500.0,2870,,,,,,No,,,,,,,,
13,13,9716762,APARTMENT,PENTHOUSE,285000.0,2220,,,,,,No,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,14533,9581654,APARTMENT,SERVICE_FLAT,253515.0,3500,,,1.0,,8.0,No,,,,,,,,
14136,14534,9463951,APARTMENT,SERVICE_FLAT,250200.0,2320,,,1.0,,,No,,,,,,,,
14137,14535,9463953,APARTMENT,SERVICE_FLAT,204760.0,2870,,,1.0,,,No,,,,,,,,
14141,14539,7921693,APARTMENT,SERVICE_FLAT,1350000.0,5600,,,30.0,,,No,,,,,,,,


In [17]:
# Number of rows with no building condition recorded.
df['Building condition'].isna().sum()


3828

In [18]:
# Missing conditions are imputed with 2, the code that building_condition_map
# assigns to 'To renovate'; the null count is then re-checked.
condition_filled = df['Building condition'].fillna(2)
df['Building condition'] = condition_filled
df['Building condition'].isnull().sum()

0

### Kitchen type	


### Bedrooms


### Furnished


In [19]:
# Distinct values present in the 'Furnished' column (NaN included).
pd.unique(df['Furnished'])

array(['Yes', 'No', nan], dtype=object)

In [20]:
# Number of rows with no 'Furnished' value.
df['Furnished'].isna().sum()

4080

#### Fill missing values with "No", then map No/Yes to 0/1

In [21]:
# Treat a missing value as "No", then encode: "No" -> 0, anything else -> 1.
furnished_filled = df['Furnished'].fillna("No")
df['Furnished'] = (furnished_filled != "No").astype('int64')

#### Double-check that the values are only 1 and 0 and that nothing is missing

In [22]:
# Print the distinct encoded values, then the number of missing entries.
furnished = df['Furnished']
print(furnished.unique())
print(furnished.isna().sum())

[1 0]
0


### Terrace surface + Terrace	


### Tenement building	


### Swimming pool	


### How many fireplaces?	


### Garden + Garden surface	


In [23]:
# Print how many 'Garden' entries are missing, then the distinct values present.
garden = df['Garden']
print(garden.isna().sum())
print(garden.unique())

11051
[nan 'Yes']


In [24]:
# Rows for which no garden surface is recorded.
df[df['Garden surface'].isnull()]

Unnamed: 0.1,Unnamed: 0,Immoweb ID,Property type,property sub-type,Price,Post code,Building condition,Kitchen type,Bedrooms,Furnished,Terrace surface,Tenement building,Number of frontages,Swimming pool,How many fireplaces?,Garden,Garden orientation,Garden surface,Terrace,Surface of the plot
0,0,9729720,APARTMENT,PENTHOUSE,179000.0,1140,4.0,Semi equipped,1.0,1,9.0,No,,,,,,,,
1,1,9729785,APARTMENT,PENTHOUSE,255000.0,8370,4.0,Installed,1.0,1,46.0,No,2.0,,,,,,,
2,2,9729784,APARTMENT,PENTHOUSE,255000.0,8370,4.0,Installed,1.0,1,46.0,No,2.0,,,,,,,
3,3,9729780,APARTMENT,PENTHOUSE,620000.0,8370,4.0,Installed,3.0,1,70.0,No,2.0,,,,,,,
4,4,9727201,APARTMENT,PENTHOUSE,379000.0,1020,4.0,USA hyper equipped,2.0,0,110.0,No,3.0,No,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14138,14536,7770552,APARTMENT,SERVICE_FLAT,132000.0,1730,6.0,,1.0,0,,No,,,,,,,,
14139,14537,7770553,APARTMENT,SERVICE_FLAT,177000.0,1730,6.0,,1.0,0,,No,,,,,,,Yes,
14140,14538,7770551,APARTMENT,SERVICE_FLAT,190000.0,1730,6.0,,2.0,0,,No,,,,,,,Yes,
14141,14539,7921693,APARTMENT,SERVICE_FLAT,1350000.0,5600,2.0,,30.0,0,,No,,,,,,,,


#### Combine `Garden` and `Garden surface` into `garden_label`, encoded as 1/0

In [25]:
def categorise(col):
    """Return 1 when the row has a garden, otherwise 0.

    A row counts as having a garden when its 'Garden' entry equals "Yes" or
    its 'Garden surface' entry is greater than 0. 'Garden surface' is only
    inspected when 'Garden' is not "Yes".
    """
    has_garden = col['Garden'] == "Yes" or col['Garden surface'] > 0
    return 1 if has_garden else 0


# Evaluate categorise once per row (axis=1) and store the 1/0 flag.
df['garden_label'] = df.apply(categorise, axis=1)

#### Double-check that the values are only 1 and 0 and that nothing is missing

In [27]:
# Distinct values of the new flag.
pd.unique(df['garden_label'])

array([0, 1])

#### Drop the columns `Garden` and `Garden surface` (TODO: not done yet — both columns are still present in the frame)

### Garden orientation	


### Surface of the plot




### Final check: percentage of missing values per column




In [28]:
# Share of missing entries per column, expressed as a percentage of the row count.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

# One row per column: its name and its percentage of missing values.
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Unnamed: 0,column_name,percent_missing
Unnamed: 0,Unnamed: 0,0.0
Immoweb ID,Immoweb ID,0.0
Property type,Property type,0.0
property sub-type,property sub-type,0.0
Price,Price,0.0
Post code,Post code,0.0
Building condition,Building condition,0.0
Kitchen type,Kitchen type,37.318811
Bedrooms,Bedrooms,14.904594
Furnished,Furnished,0.0


# Data Analysis

### Group Post code to region

In [2]:
%pip install pgeocode

Collecting pgeocode
  Using cached pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Collecting requests
  Using cached requests-2.27.1-py2.py3-none-any.whl (63 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.8-py2.py3-none-any.whl (138 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.3-py3-none-any.whl (61 kB)
Collecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.11-py3-none-any.whl (39 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2021.10.8-py2.py3-none-any.whl (149 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests, pgeocode
Successfully installed certifi-2021.10.8 charset-normalizer-2.0.11 idna-3.3 pgeocode-0.3.0 requests-2.27.1 urllib3-1.26.8
You should consider upgrading via the '/Users/hsinhan/PycharmProjects/challenge_immo_eliza_analysis/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
import pgeocode

# Postal-code lookup for country code "be", spot-checked with post code 2000.
nomi = pgeocode.Nominatim("be")
sample_record = nomi.query_postal_code("2000")
sample_record['state_name']

postal_code             2000
country_code              BE
place_name         Antwerpen
state_name        Vlaanderen
state_code               VLG
county_name           Anvers
county_code              VAN
community_name     Antwerpen
community_code            11
latitude             51.2199
longitude             4.4035
accuracy                   4
Name: 0, dtype: object

In [30]:
# Spot-check a second post code.
record_5030 = nomi.query_postal_code("5030")
record_5030['state_name']

'Wallonie'

In [31]:
# NOTE: `test` is an alias of `df`, not a copy, so the 'region' column added
# below also appears on `df`.
test = df


def get_state(col):
    """Return the pgeocode ``state_name`` for one row's 'Post code'.

    Parameters
    ----------
    col : row-like object indexable by column name
        Must provide a 'Post code' entry, e.g. a row handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The ``state_name`` field of ``nomi.query_postal_code`` for that post code.
    """
    return nomi.query_postal_code(col['Post code'])['state_name']


# Resolve every distinct post code in ONE batched query instead of issuing a
# separate query for each row, then broadcast the regions back onto the rows.
unique_codes = test['Post code'].drop_duplicates()
lookup = nomi.query_postal_code(unique_codes.astype(str).tolist())
# The batched result has one row per queried code, in query order.
region_by_code = dict(zip(unique_codes, lookup['state_name']))
test['region'] = test['Post code'].map(region_by_code)


0        Bruxelles-Capitale
1                Vlaanderen
2                Vlaanderen
3                Vlaanderen
4        Bruxelles-Capitale
                ...        
14138            Vlaanderen
14139            Vlaanderen
14140            Vlaanderen
14141              Wallonie
14142              Wallonie
Name: region, Length: 12211, dtype: object

In [32]:
# Display the frame, which now carries the 'region' column.
test

Unnamed: 0.1,Unnamed: 0,Immoweb ID,Property type,property sub-type,Price,Post code,Building condition,Kitchen type,Bedrooms,Furnished,...,Number of frontages,Swimming pool,How many fireplaces?,Garden,Garden orientation,Garden surface,Terrace,Surface of the plot,garden_label,region
0,0,9729720,APARTMENT,PENTHOUSE,179000.0,1140,4.0,Semi equipped,1.0,1,...,,,,,,,,,0,Bruxelles-Capitale
1,1,9729785,APARTMENT,PENTHOUSE,255000.0,8370,4.0,Installed,1.0,1,...,2.0,,,,,,,,0,Vlaanderen
2,2,9729784,APARTMENT,PENTHOUSE,255000.0,8370,4.0,Installed,1.0,1,...,2.0,,,,,,,,0,Vlaanderen
3,3,9729780,APARTMENT,PENTHOUSE,620000.0,8370,4.0,Installed,3.0,1,...,2.0,,,,,,,,0,Vlaanderen
4,4,9727201,APARTMENT,PENTHOUSE,379000.0,1020,4.0,USA hyper equipped,2.0,0,...,3.0,No,,,,,,,0,Bruxelles-Capitale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14138,14536,7770552,APARTMENT,SERVICE_FLAT,132000.0,1730,6.0,,1.0,0,...,,,,,,,,,0,Vlaanderen
14139,14537,7770553,APARTMENT,SERVICE_FLAT,177000.0,1730,6.0,,1.0,0,...,,,,,,,Yes,,0,Vlaanderen
14140,14538,7770551,APARTMENT,SERVICE_FLAT,190000.0,1730,6.0,,2.0,0,...,,,,,,,Yes,,0,Vlaanderen
14141,14539,7921693,APARTMENT,SERVICE_FLAT,1350000.0,5600,2.0,,30.0,0,...,,,,,,,,,0,Wallonie
