In [1]:
import pandas as pd  # For data manipulation and DataFrame handling
import numpy as np  # For numerical operations and handling missing values
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler  # For feature scaling (optional for decision trees)
from sklearn.impute import SimpleImputer  # For handling missing values
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For more advanced plotting
from sklearn.tree import plot_tree  # For visualizing the decision tree
from sklearn.preprocessing import MinMaxScaler

import csv
import os

In [9]:
# Check how complete 'gardensurface' is among the listings flagged as having a garden.
dataset = '../data/raw/dataset_province_municipality_code_large.csv'
df = pd.read_csv(dataset)
df = df.loc[df['garden'].eq(1)]
print(df.shape)

# Rows where the garden surface is absent altogether.
missing_count = df['gardensurface'].isna().sum()
print(f"Number of missing values in 'gardensurface': {missing_count}")

# Despite its name this is a boolean mask: True where the surface is absent or equal to 0.
missing_or_zero_count = df['gardensurface'].isna() | df['gardensurface'].eq(0)
total_count = missing_or_zero_count.sum()

print(f"Number of missing or zero values in 'gardensurface': {total_count}")



(10349, 41)
Number of missing values in 'gardensurface': 0
Number of missing or zero values in 'gardensurface': 0


In [2]:
# Merge the re-scraped listings (df2) into the existing dataset (df).
# pandas is already imported as `pd` in the notebook's import cell.

# Load the datasets
# NOTE(review): the saved output of this cell is a FileNotFoundError for the
# first path below -- confirm the file name before re-running.
dataset = '../data/raw/dataset_province_municipality_code_large3.csv'
df = pd.read_csv(dataset)
dataset2 = '../data/raw/rescraped4.csv'
df2 = pd.read_csv(dataset2)
# Convert column names to lowercase
df2 = df2.rename(str.lower, axis='columns')

# Flag the two main property types
df2['is_apartment'] = (df2['type'] == 'APARTMENT').astype(int)
df2['is_house'] = (df2['type'] == 'HOUSE').astype(int)
# Listings with no frontage count are given 2
df2['frontages'] = df2['frontages'].fillna(2)
# Align df2's column names with the names selected in `columns_to_keep` below
df2 = df2.rename(columns={
    'frontages': 'facades',
    'postcode': 'postal_code',
    'plot_surface': 'surfaceoftheplot',
    'garden_surface': 'gardensurface',
    'swimmingpool': 'pool',
    'area': 'livingarea'
})
print(df2['postal_code'])
# .copy() so the assignments below write to an independent frame instead of a
# possible view of the pre-dropna data (avoids SettingWithCopyWarning).
df2 = df2.dropna(subset=['postal_code']).copy()

# Remove any letters from 'postal_code'. The column is cast to str first
# because the .str accessor raises on a numeric column and returns NaN for
# non-string elements of a mixed column.
postal_code_text = df2['postal_code'].astype(str).str.replace(r'[a-zA-Z]', '', regex=True)

# Convert to integers. Values that are still not numeric after stripping
# letters become NaN and those rows are dropped, like rows with no postal code.
df2['postal_code'] = pd.to_numeric(postal_code_text, errors='coerce')
df2 = df2.dropna(subset=['postal_code']).copy()
df2['postal_code'] = df2['postal_code'].astype(int)
print(df2['postal_code'])

# Create one binary column per property type; the column name is the
# lowercased value of 'type'.
property_types = ['APARTMENT_BLOCK', 'GROUND_FLOOR', 'COUNTRY_COTTAGE', 'MANSION',
                  'PENTHOUSE', 'EXCEPTIONAL_PROPERTY', 'KOT', 'LOFT', 'MANOR_HOUSE',
                  'SERVICE_FLAT', 'CHALET']
for property_type in property_types:
    df2[property_type.lower()] = (df2['type'] == property_type).astype(int)

# Create one binary column per value of state_of_building, named the same way.
building_states = ['AS_NEW', 'GOOD', 'JUST_RENOVATED', 'TO_BE_DONE_UP',
                   'TO_RENOVATE', 'TO_RESTORE']
for building_state in building_states:
    df2[building_state.lower()] = (df2['state_of_building'] == building_state).astype(int)

# Select relevant columns for df2: the base features, then the type flags,
# the building-state flags, and the two main-type flags.
columns_to_keep = (
    ['bedrooms', 'postal_code', 'kitchen', 'facades', 'price',
     'furnished', 'terrace', 'fireplace', 'garden', 'gardensurface', 'pool',
     'livingarea', 'surfaceoftheplot']
    + [property_type.lower() for property_type in property_types]
    + [building_state.lower() for building_state in building_states]
    + ['is_apartment', 'is_house']
)

df2_selected = df2[columns_to_keep]

# Stack (concatenate) df2 under df; columns present in only one frame are
# filled with NaN for the rows of the other.
stacked_df = pd.concat([df, df2_selected], ignore_index=True)
print(stacked_df['postal_code'])

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/dataset_province_municipality_code_large3.csv'

In [None]:
# Load the communes reference table used to look up values by postal code.
communes = '../data/raw/cities.csv'
df_communes = pd.read_csv(communes)
# Show the available columns; the trailing expression displays (rows, columns).
print(df_communes.columns)
df_communes.shape


Index(['name', 'zipCode', 'nisCode', 'province', 'main'], dtype='object')


(2721, 5)

In [None]:
# Build zipCode -> value lookups from the communes table. When a zipCode
# occurs on several rows, dict() keeps the value from the last such row.
zip_codes = df_communes['zipCode']
nis_dict = dict(zip(zip_codes, df_communes['nisCode']))
locality_dict = dict(zip(zip_codes, df_communes['name']))
province_dict = dict(zip(zip_codes, df_communes['province']))

# Add (or overwrite) one column per lookup, keyed on each row's postal_code;
# postal codes absent from a lookup yield NaN.
for target_column, lookup in (
    ('municipality_code', nis_dict),
    ('locality', locality_dict),
    ('province', province_dict),
):
    stacked_df[target_column] = stacked_df['postal_code'].map(lookup)

# Remove rows that are identical across every column.
stacked_df = stacked_df.drop_duplicates()



In [None]:
# Clean up missing values in the stacked frame:
#   1. drop rows lacking livingarea, municipality_code, locality or bedrooms,
#   2. treat a missing garden / plot surface as 0,
#   3. store bedrooms as integers,
#   4. remove fully duplicated rows.
stacked_df = (
    stacked_df
    .dropna(subset=['livingarea', 'municipality_code', 'locality', 'bedrooms'])
    .assign(
        gardensurface=lambda frame: frame['gardensurface'].fillna(0),
        surfaceoftheplot=lambda frame: frame['surfaceoftheplot'].fillna(0),
        bedrooms=lambda frame: frame['bedrooms'].astype(int),
    )
    .drop_duplicates()
)

# Report the remaining NaN count for every column.
missing_values = stacked_df.isna().sum()
print(f"Number of missing values: \n{missing_values}")

Number of missing values: 
Unnamed: 0.3             9750
Unnamed: 0.2            13442
Unnamed: 0.1            15958
Unnamed: 0              26599
bedrooms                    0
postal_code                 0
kitchen                     0
facades                     0
price                       0
furnished                   0
terrace                     0
fireplace                   0
garden                      0
gardensurface               0
pool                        0
livingarea                  0
surfaceoftheplot            0
municipality_code           0
apartment_block             0
ground_floor                0
country_cottage             0
mansion                     0
penthouse                   0
exceptional_property        0
kot                         0
loft                        0
manor_house                 0
service_flat                0
chalet                      0
as_new                      0
good                        0
just_renovated              0
to_be_done_up

In [None]:
# Persist the cleaned, merged dataset. index=False keeps the row index out of
# the file: written with the index and read back without index_col, every
# save/reload cycle adds another 'Unnamed: 0' column (this frame already
# carries 'Unnamed: 0' through 'Unnamed: 0.3').
stacked_df.to_csv('dataset_province_municipality_code_large4.csv', index=False)


In [None]:
# List the columns of the final stacked frame.
print(stacked_df.columns)

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       'bedrooms', 'postal_code', 'kitchen', 'facades', 'price', 'furnished',
       'terrace', 'fireplace', 'garden', 'gardensurface', 'pool', 'livingarea',
       'surfaceoftheplot', 'municipality_code', 'apartment_block',
       'ground_floor', 'country_cottage', 'mansion', 'penthouse',
       'exceptional_property', 'kot', 'loft', 'manor_house', 'service_flat',
       'chalet', 'as_new', 'good', 'just_renovated', 'to_be_done_up',
       'to_renovate', 'to_restore', 'price_sqm', 'is_apartment', 'is_house',
       'locality', 'province'],
      dtype='object')
