In [2]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
from clean_dataset import import_csv_data

In [3]:
file_path = 'data\properties06191148_modified.csv'
raw_df = import_csv_data(file_path)


### Change columns names

In [5]:
raw_df.columns

Index(['zimmo code', 'type', 'price', 'street', 'number', 'postcode', 'city',
       'living area(m²)', 'ground area(m²)', 'bedroom', 'bathroom', 'garage',
       'garden', 'EPC(kWh/m²)', 'renovation obligation', 'year built',
       'mobiscore', 'url'],
      dtype='object')

In [6]:
def join_column_names(df:pd) -> None:
    columns_new_names = dict()
    for column in df.columns:
        s = "_".join(column.split())
        if s != column:
            columns_new_names[column] = s

    df.rename(columns=columns_new_names, inplace=True)

In [12]:
join_column_names(raw_df)

#### Duplicates

In [4]:
raw_df[raw_df.duplicated()]

Unnamed: 0,zimmo code,type,price,street,number,postcode,city,living area(m²),ground area(m²),bedroom,bathroom,garage,garden,EPC(kWh/m²),renovation obligation,year built,mobiscore,url


In [7]:
len(raw_df)

25403

In [8]:
raw_df.drop_duplicates(subset=['zimmo code'], inplace=True)

In [9]:
len(raw_df)

25403

In [30]:
raw_df.head()

Unnamed: 0,zimmo code,type,price,street,number,postcode,city,living area(m²),ground area(m²),bedroom,bathroom,garage,garden,EPC(kWh/m²),renovation obligation,year built,mobiscore,url
0,L97OB,Vakantiewoning (Huis),25000.0,,,8620,Nieuwpoort,35.0,128.0,2.0,1.0,,False,,False,,7.0,https://www.zimmo.be/nl/nieuwpoort-8620/te-koo...
1,L9SVC,Appartement,45000.0,,,5570,Beauraing,62.0,,2.0,1.0,,False,,,,,https://www.zimmo.be/nl/beauraing-5570/te-koop...
2,LA02N,Rijwoning (Huis),45000.0,Oudestraat,94.0,9600,Ronse,,232.0,2.0,1.0,,False,716.0,True,1850.0,7.3,https://www.zimmo.be/nl/ronse-9600/te-koop/hui...
3,L4X2D,Vakantiewoning (Huis),40000.0,Molenheidestraat,7.0,3530,Helchteren,45.0,,,,,False,,False,,5.3,https://www.zimmo.be/nl/helchteren-3530/te-koo...
4,L9KJ7,Eengezinswoning (Huis),49900.0,Route Napoléon,10.0,4400,Ivoz-Ramet,123.0,8885.0,2.0,1.0,,False,569.0,,,,https://www.zimmo.be/nl/ivoz-ramet-4400/te-koo...


#### Bedroom

In [None]:
raw_df['bedroom'].describe()


count    22287.000000
mean         3.144030
std          1.762147
min          0.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         79.000000
Name: bedroom, dtype: float64

In [39]:
# Null values
number_of_nan_bedrooms = len(raw_df[raw_df['bedroom'].isna()])
number_of_nan_bedrooms

3116

#### Bathroom

In [42]:
raw_df['bathroom'].describe()

count    20773.000000
mean         1.482116
std          1.073296
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         29.000000
Name: bathroom, dtype: float64

In [43]:
number_of_nan_bathrooms = len(raw_df[raw_df['bathroom'].isna()])
number_of_nan_bathrooms

4630

#### Garage

In [44]:
raw_df['garage'].describe()

count    8859.000000
mean        1.385258
std         2.617225
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max       110.000000
Name: garage, dtype: float64

In [52]:
number_of_nan_garage = len(raw_df[raw_df['garage'].isna()])
number_of_nan_garage 

0

##### Replace NaN elements with 0's

In [82]:
raw_df['garage'].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_df['garage'].fillna(0,inplace=True)


In [83]:
number_of_nan_garage = len(raw_df[raw_df['garage'].isna()])
number_of_nan_garage 

0

In [None]:
join_column_names(raw_df)

#### Garden

In [49]:
raw_df['garden'].describe()

count     25403
unique        2
top       False
freq      20835
Name: garden, dtype: object

In [51]:
number_of_nan_garden = len(raw_df[raw_df['garden'].isna()])
number_of_nan_garden

0

#### EPC

In [53]:
raw_df['EPC(kWh/m²)'].describe()

count    17467.000000
mean       309.127727
std        454.993049
min          0.000000
25%        139.000000
50%        237.000000
75%        383.000000
max      14954.000000
Name: EPC(kWh/m²), dtype: float64

In [55]:
number_of_nan_EPC = len(raw_df[raw_df['EPC(kWh/m²)'].isna()])
number_of_nan_EPC

7936

#### Renovation Obligation

In [56]:
raw_df['renovation obligation'].describe()


count     19275
unique        2
top       False
freq      16259
Name: renovation obligation, dtype: object

In [57]:
number_of_nan_ren = len(raw_df[raw_df['renovation obligation'].isna()])
number_of_nan_ren

6128

In [58]:
raw_df['year built'].describe()


count    17761.000000
mean      1980.894488
std         57.726582
min          6.000000
25%       1961.000000
50%       1989.000000
75%       2022.000000
max       3025.000000
Name: year built, dtype: float64

In [59]:
number_of_nan_year= len(raw_df[raw_df['year built'].isna()])
number_of_nan_year

7642

In [61]:
raw_df['mobiscore'].describe()


count    20103.000000
mean         7.855982
std          1.216390
min          3.300000
25%          7.000000
50%          8.100000
75%          8.800000
max          9.900000
Name: mobiscore, dtype: float64

In [63]:
number_of_nan_mobi = len(raw_df[raw_df['mobiscore'].isna()])
number_of_nan_mobi

5300

In [91]:
raw_df['postcode'] = raw_df['postcode'].astype('category')

In [1]:
sns.swarmplot(x='postcode', y='mobiscore', data=raw_df.head(50)) 



NameError: name 'sns' is not defined

#### Checking attributes types

In [13]:
raw_df.dtypes

zimmo_code                object
type                      object
price                    float64
street                    object
number                    object
postcode                  object
city                      object
living_area(m²)          float64
ground_area(m²)          float64
bedroom                  float64
bathroom                 float64
garage                   float64
garden                      bool
EPC(kWh/m²)              float64
renovation_obligation     object
year_built               float64
mobiscore                float64
url                       object
dtype: object

##### Check to see if several attributes have NaN values

In [14]:
raw_df[raw_df[['bedroom', 'bathroom', 'living_area(m²)', 'ground_area(m²)']].isna().all(axis=1)]

Unnamed: 0,zimmo_code,type,price,street,number,postcode,city,living_area(m²),ground_area(m²),bedroom,bathroom,garage,garden,EPC(kWh/m²),renovation_obligation,year_built,mobiscore,url
33,L5O5X,Vakantiewoning (Huis),42500.0,Parelstrand,154,3920,Lommel,,,,,,False,,False,,6.8,https://www.zimmo.be/nl/lommel-3920/te-koop/hu...
34,L7MF9,Appartement,38500.0,Dorlodotlaan,8,8670,Koksijde,,,,,,False,,False,1975.0,8.1,https://www.zimmo.be/nl/koksijde-8670/te-koop/...
44,L3Q5V,Gemengd gebruik (Huis),25000.0,Rue Appâa,15,7340,Wasmes,,,,,1.0,False,,,,,https://www.zimmo.be/nl/wasmes-7340/te-koop/hu...
49,L3VPC,Woning (Huis),29000.0,Morelgem,,9520,Vlierzele,,,,,,False,,False,,,https://www.zimmo.be/nl/vlierzele-9520/te-koop...
57,KXLWR,Studio met slaaphoek (Appartement),29990.0,krommedijk,,8301,Duinbergen,,,,,,False,0.0,False,,8.4,https://www.zimmo.be/nl/duinbergen-8301/te-koo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25196,L23Q1,Woning (Huis),2200000.0,Wayezstraat,,1070,Anderlecht,,,,,,False,,,,9.6,https://www.zimmo.be/nl/anderlecht-1070/te-koo...
25226,L0J0D,Woning (Huis),3209250.0,Prinsenstraat NB,,8940,Wervik,,,,,,False,,False,,7.8,https://www.zimmo.be/nl/wervik-8940/te-koop/hu...
25236,L0G6C,Appartement,2890000.0,Sterrenstraat,52,2500,Lier,,,,,,False,30.0,False,2025.0,8.4,https://www.zimmo.be/nl/lier-2500/te-koop/appa...
25245,KZQPD,Uitzonderlijke woning (Huis),1850000.0,,,3000,Leuven,,,,,,False,249.0,False,,9.5,https://www.zimmo.be/nl/leuven-3000/te-koop/hu...


##### Dropping the url columns

In [115]:
clean_df = raw_df.drop(columns={'url'})