In [3]:
import warnings

# Install a blanket "ignore" filter: from here on no warning of any category
# is shown for the rest of the session.
warnings.filterwarnings(action='ignore')

In [12]:
!pip install missingno lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/b8/9d/1ce80cee7c5ef60f2fcc7e9fa97f29f7a8de3dc5a08922b3b2f1e9106481/lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl.metadata
  Downloading lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Downloading lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.1.0
[0m

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
import missingno
# from sklearn.preprocessing import preprocessing_pipeline
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV

1. seloger.com
2. bienici.com
3. logic-immo.com
4. al-in.fr
5. pap.fr

In [16]:
# Column names grouped by the kind of value they hold.

# Real-valued measurements (the target 'prix' is included here).
continuous_numerical = [
    'prix',                # price
    'valeur_energie',      # energy performance value
    'valeur_ges',          # greenhouse gas emission performance value
    'latitude',
    'longitude',
    'superficie',          # living area
    'superficie_terrain',  # land size
]

# Counts and other integer-like quantities.
discrete_numerical = [
    'nombre_pieces',          # number of rooms
    'nombre_chambres',        # number of bedrooms
    'nombre_salles_de_bain',  # number of bathrooms
    'etage',                  # floor
    'nombre_box',             # number of boxes
    'nombre_terrasses',       # number of terraces
    'nombre_parkings',        # number of parking spots
    'nombre_photos',          # number of photos attached to the listing
]

# Yes/no flags.
binary = [
    'presence_balcon',             # has a balcony
    'presence_cave',               # has a cellar
    'presence_garage',             # has a garage
    'presence_climatisation',      # has air conditioning
    'etage_dernier_etage',         # on the top floor
    'etage_etage_eleve',           # in the upper floors
    'etage_rez_de_jardin',         # on the ground floor
    'superficie_multiples_biens',  # combined area of several properties
    'is_neuf',                     # is new
]

# Label-valued columns.
categorical = [
    'categorie_energie',  # energy performance category
    'categorie_ges',      # greenhouse gas emission category
    'ville',              # city
    'departement',        # postal code
    'type_client',        # type of client advertising the listing
    'source',             # source of the listing
    'exposition',         # direction the property is facing
    'type_bien',          # property type
]

# Union of the four groups, shown as the cell's output.
columns = {
    *continuous_numerical,
    *discrete_numerical,
    *binary,
    *categorical,
}
columns

{'categorie_energie',
 'categorie_ges',
 'departement',
 'etage',
 'etage_dernier_etage',
 'etage_etage_eleve',
 'etage_rez_de_jardin',
 'exposition',
 'is_neuf',
 'latitude',
 'longitude',
 'nombre_box',
 'nombre_chambres',
 'nombre_parkings',
 'nombre_photos',
 'nombre_pieces',
 'nombre_salles_de_bain',
 'nombre_terrasses',
 'presence_balcon',
 'presence_cave',
 'presence_climatisation',
 'presence_garage',
 'prix',
 'source',
 'superficie',
 'superficie_multiples_biens',
 'superficie_terrain',
 'type_bien',
 'type_client',
 'valeur_energie',
 'valeur_ges',
 'ville'}

In [None]:

# Read the listings file; the first CSV column is used as the row index.
file_path = "real_estate_data.csv"
data = pd.read_csv(file_path, sep=',', index_col=0)

# 300,000 rows go to the training set, the remainder to the test set
# (seeded so the split is reproducible).
train_data, test_data = train_test_split(
    data,
    train_size=300_000,
    random_state=42,
)

# df is a second name for train_data: no copy is made here.
df = train_data

In [None]:
# Count the rows that are an exact copy of another row; keep=False flags
# every member of a duplicate group, not only the repeats.
is_exact_duplicate = df.duplicated(keep=False)
is_exact_duplicate.sum()
# returns 0

In [None]:
# Remove listings that share an id or a description with another listing.
# keep=False discards every member of a duplicate group, not just the repeats.
#
# The result is reassigned instead of using inplace=True: df was bound to the
# very same object as train_data, so the in-place version silently altered
# train_data as well.
# NOTE(review): if a later step relied on train_data having been
# de-duplicated as a side effect of this cell, apply the step to train_data
# explicitly.
df = df.drop_duplicates(subset="id_annonce", keep=False)
df = df.drop_duplicates(subset="description", keep=False)

In [None]:
# Fraction of missing values in each column (0 = complete, 1 = all missing).
missing_counts = df.isna().sum()
missing_counts / len(df)

In [None]:
# Discard rows that lack a price or either coordinate.
required_columns = ['prix', 'latitude', 'longitude']
df = df.dropna(subset=required_columns)

In [None]:
# Property types in this list get a land size of 0 when none is recorded.
no_land_types = ['appartement', 'loft', 'chambre']
no_land_rows = df.type_bien.isin(no_land_types) & df.superficie_terrain.isna()
df.loc[no_land_rows, 'superficie_terrain'] = 0

# For land listings with no land size but a value in 'superficie', move that
# value into 'superficie_terrain' and set 'superficie' to 0.
land_types = ['terrain à bâtir', 'terrain']
rows = (
    df.type_bien.isin(land_types)
    & df.superficie_terrain.isna()
    & df.superficie.notna()
)
df.loc[rows, 'superficie_terrain'] = df.loc[rows, 'superficie']
df.loc[rows, 'superficie'] = 0

In [None]:

# Plot the missing-value pattern of the four energy / GES columns.
energy_columns = ['categorie_energie', 'valeur_energie',
                  'categorie_ges', 'valeur_ges']
missingno.matrix(df[energy_columns])

In [None]:
from pandas.api.types import is_numeric_dtype, is_string_dtype

# Fill the gaps in every column except the target 'prix':
#   * numeric columns  -> column median
#   * text columns     -> the placeholder 'Unknown'
for col in df.columns:
    if col == 'prix':
        continue
    col_dtype = df[col].dtype
    if is_numeric_dtype(col_dtype):
        df[col] = df[col].fillna(df[col].median())
    # The dtype is tested rather than the Series itself: from pandas 2.0,
    # is_string_dtype(series) inspects the values of an object column and
    # returns False when any of them is not a string, missing values
    # included, so the columns that need filling would be skipped.
    elif is_string_dtype(col_dtype):
        df[col] = df[col].fillna('Unknown')

In [None]:
# continuous_numerical is already defined in the column-list cell near the top
# of the notebook; the verbatim copy that used to live here was a second
# definition that had to be kept in sync by hand.
# Summary statistics with extra percentiles to expose the tails, rounded to
# whole numbers.
df[continuous_numerical].describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
).round()

In [None]:
# discrete_numerical is already defined in the column-list cell near the top
# of the notebook; the verbatim copy that used to live here was a second
# definition that had to be kept in sync by hand.
# Summary statistics with extra percentiles to expose the tails, rounded to
# whole numbers.
df[discrete_numerical].describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
).round()

In [None]:
def remove_outliers(df, lower_outliers, q_bottom, upper_outliers,
                    q_top):
    """Drop rows whose values fall outside per-column quantile bounds.

    Parameters
    ----------
    df : DataFrame to filter; it is not modified, a filtered frame is returned.
    lower_outliers : list of column names trimmed from below.
    q_bottom : quantile (0-1) used as the lower bound for those columns.
    upper_outliers : list of column names trimmed from above.
    q_top : quantile (0-1) used as the upper bound for those columns.

    Returns
    -------
    DataFrame containing only the rows that satisfy every bound.

    Notes
    -----
    All lower bounds are computed once on the incoming frame; all upper
    bounds are computed once on the frame left after the lower trimming.
    Bounds are inclusive. Rows holding NaN in a checked column fail the
    comparison and are therefore dropped as well.
    """
    floors = df[lower_outliers].quantile(q_bottom)
    for name in lower_outliers:
        at_or_above_floor = df[name] >= floors[name]
        df = df.loc[at_or_above_floor]

    ceilings = df[upper_outliers].quantile(q_top)
    for name in upper_outliers:
        at_or_below_ceiling = df[name] <= ceilings[name]
        df = df.loc[at_or_below_ceiling]

    return df
# Only the price is trimmed from below (1st percentile); the price, the
# energy / GES values, both areas and every discrete count are trimmed from
# above (99th percentile).
lower_outliers = ['prix']
upper_outliers = [
    'prix', 'valeur_ges', 'valeur_energie',
    'superficie', 'superficie_terrain',
    *discrete_numerical,
]
df = remove_outliers(df, lower_outliers, q_bottom=0.01,
                     upper_outliers=upper_outliers, q_top=0.99)

In [None]:
# binary is already defined in the column-list cell near the top of the
# notebook; the verbatim copy that used to live here was a second definition
# that had to be kept in sync by hand.
# Column sum expressed as a percentage of the number of rows, per flag.
binary_rates = 100 * df[binary].sum() / len(df)

In [None]:
# categorical is already defined in the column-list cell near the top of the
# notebook; the verbatim copy that used to live here was a second definition
# that had to be kept in sync by hand.
# Number of distinct values taken by each categorical column.
modality_counts = df[categorical].nunique()

In [None]:
# Ordinal encoding of the A-G labels: A -> 0 ... G -> 6.
grade_to_rank = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

for col in ['categorie_energie', 'categorie_ges']:
    # After replace(), anything that is not a rank (the 'Unknown'
    # placeholder or any other label) is coerced to NaN, which also gives a
    # float column. Replacing 'Unknown' with None left an object-dtype
    # column instead. Re-running the cell is harmless: already-encoded
    # values pass through unchanged.
    ranks = pd.to_numeric(df[col].replace(grade_to_rank), errors='coerce')
    # Missing ranks take the median of the known ones.
    df[col] = ranks.fillna(ranks.median())

In [None]:
# Relative frequency of each property type.
type_shares = df['type_bien'].value_counts(normalize=True)

# Types covering less than 1% of the rows are merged into a single 'autre'
# bucket.
is_minor = type_shares < 0.01
minor_types = type_shares[is_minor].index.tolist()
df['type_bien'] = df['type_bien'].replace(to_replace=minor_types,
                                          value='autre')

In [None]:
# One-hot encode the listed columns: each one is replaced by one indicator
# column per distinct value.
dummies = [
    'type_bien',
    'type_client',
    'source',
    'exposition',
]
df = pd.get_dummies(data=df, columns=dummies)

In [None]:
# Keep only the listings whose description is longer than 100 characters.
# NOTE(review): len() is applied to every value, so a missing (NaN)
# description would raise a TypeError here. Confirm none can reach this cell.
description_length = df.description.apply(len)
df = df[description_length > 100]

In [None]:

# NOTE(review): preprocessing_pipeline is not defined in any visible cell (the
# only mention is a commented-out import at the top of the notebook); it has
# to exist before this cell can run.
# The pipeline is fitted on the training set only and then applied unchanged
# to the test set.
train_data = preprocessing_pipeline.fit_transform(train_data)
test_data = preprocessing_pipeline.transform(test_data)

# The same columns are removed from both sets.
columns_to_drop = ['titre', 'description', 'id_annonce',
                   'ville', 'departement']
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

In [None]:

# Base model handed to the hyper-parameter search; seeded for reproducibility.
estimator = LGBMRegressor(
    random_state=42,
)

In [None]:

# Candidate values for each hyper-parameter explored by the search.
parameters = dict(
    num_leaves=[10, 30, 50, 100, 200],
    max_depth=[None, 5, 10, 20, 50],
    n_estimators=[150, 200, 400, 600],
    learning_rate=[0.05, 0.1, 0.25, 0.5],
)

# Randomized search: 50 sampled combinations, scored by R², seeded so the
# same combinations are drawn on every run.
model = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameters,
    n_iter=50,
    scoring='r2',
    random_state=42,
)

In [None]:
# Separate the target ('prix') from the features, then run the search.
y_train = train_data['prix']
X_train = train_data.drop(columns=['prix'])
model.fit(X_train, y_train)

In [None]:
# Same target / feature separation on the test set, then predict with the
# best estimator found by the search.
y_test = test_data['prix']
X_test = test_data.drop(columns=['prix'])
best_model = model.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:

# Absolute and squared error metrics.
mae = mean_absolute_error(y_test, y_pred)
mdae = median_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Relative error metrics. The median absolute percentage error is computed by
# hand; both series are rebuilt from plain arrays so they share a fresh
# 0..n-1 index and subtract position by position.
mape = mean_absolute_percentage_error(y_test, y_pred)
actual = pd.Series(y_test.to_numpy())
predicted = pd.Series(y_pred)
relative_error = (actual - predicted) / actual
mdape = relative_error.abs().median()

# Coefficient of determination.
r_squared = r2_score(y_test, y_pred)

In [None]:
# One row per training feature, most important first.
importances = model.best_estimator_.feature_importances_
feature_importances = pd.DataFrame(
    importances,
    index=X_train.columns,
    columns=['importance'],
).sort_values(by='importance', ascending=False)