In [1]:
%autosave 0
import numpy 
import matplotlib
import pickle
import pandas as pd
import numpy as np 
import tensorflow as tf
from pandas import read_csv
from matplotlib import pyplot
from pandas.plotting import scatter_matrix 
from numpy import set_printoptions 
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.compose import ColumnTransformer

# Display configuration: NumPy arrays print with no decimals, pandas floats
# with two decimals.
set_printoptions(precision=0)
pd.set_option("display.precision", 2)
# NOTE: pd.option_context(...) only takes effect inside a `with` block, so it
# is not used here for notebook-wide formatting; "display.precision" above is
# the setting that governs how floats are rendered.

# Seed every random number generator in use so that a fresh
# "Restart Kernel -> Run All" reproduces the same split and results.
seed = 15
try:
    # TensorFlow 2.x API.
    tf.random.set_seed(seed)
except AttributeError:
    # TensorFlow 1.x API (no longer exposed at top level in 2.x).
    tf.set_random_seed(seed)
numpy.random.seed(seed)

Autosave disabled


In [2]:
# Load the CSV file into a DataFrame; the first CSV column becomes the index.
# df_train_full = read_csv( './dataset-template.csv', index_col=0)
df_train_full = pd.read_csv(
    './csv/realstate-data.csv',
    index_col=0,
)

In [3]:
# Print the (rows, columns) shape, then display the first five rows
# (five is the default length of head()).
print(df_train_full.shape)
df_train_full.head()

(7200, 12)


Unnamed: 0,ref,city,county,district,area_m2,ground_m2,nb_room,nb_bedroom,pool,cellar,garage,output
0,/fr/properties/30731a-133656.htm,Saint-Maurin,Lot-et-Garonne,Aquitaine,100.0,,5.0,3.0,,,,210000
1,/fr/properties/88486a-95148cj265.htm,Castelnau-de-Médoc,Gironde,Aquitaine,328.0,3700.0,9.0,4.0,,,True,840000
2,/fr/properties/44945a-m728.htm,Peyrehorade,Landes,Aquitaine,814.0,100000.0,19.0,8.0,True,True,,1950000
3,/fr/properties/32488a-2534.htm,Bourlens,Lot-et-Garonne,Aquitaine,406.0,13200.0,14.0,10.0,,,,975200
4,/fr/properties/32488a-2557.htm,Monsempron-Libos,Lot-et-Garonne,Aquitaine,99.0,35.0,4.0,3.0,,,,101520


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; a count below the row total reveals missing values in that column.
df_train_full.describe()

Unnamed: 0,area_m2,ground_m2,nb_room,nb_bedroom,output
count,6880.0,5740.0,6311.0,6221.0,7200.0
mean,313.85,24500.0,7.1,4.52,453000.0
std,1052.33,363000.0,3.77,2.34,630000.0
min,1.0,1.0,2.0,2.0,8000.0
25%,125.0,1110.0,5.0,3.0,198000.0
50%,180.0,3000.0,6.0,4.0,311000.0
75%,280.0,9480.0,8.0,5.0,498000.0
max,29760.0,26900000.0,70.0,50.0,16000000.0


In [5]:
# Separate the target column ('output') from the remaining feature columns.
X_trainfull = df_train_full.drop(columns='output')
Y_trainfull = df_train_full['output']

In [6]:
# Sanity check: target shape on the first line, feature shape on the second.
print(Y_trainfull.shape, X_trainfull.shape, sep='\n')

(7200,)
(7200, 11)


# Traitement des variables catégorielles

In [7]:
# Names of the text-valued location columns to be encoded as categories.
cat_columns = ['city', 'county', 'district']

In [8]:
# One-hot encode the categorical columns: each column named in cat_columns is
# replaced by one indicator column per distinct value.
# NOTE(review): the raw 'city' / 'county' / 'district' columns no longer exist
# in X_trainfull after this cell, and the cell is not idempotent (a second run
# fails because those columns are gone) — confirm later cells do not rely on them.
X_trainfull = pd.get_dummies(X_trainfull, columns = cat_columns)

# Traitement des valeurs manquantes

In [9]:
# For every column of the raw frame, report whether it has at least one
# missing value.
df_train_full.isna().any()

ref           False
city          False
county        False
district      False
area_m2        True
ground_m2      True
nb_room        True
nb_bedroom     True
pool           True
cellar         True
garage         True
output        False
dtype: bool

In [10]:
# Column groups, one per imputation strategy. Each is a list (even for a single
# column) so it can be passed as a column selection to the ColumnTransformer.
boolean_columns = ['pool', 'cellar', 'garage']
area_column = ['area_m2']
ground_column = ['ground_m2']
nb_room_column = ['nb_room']
nb_bedroom_column = ['nb_bedroom']

In [11]:
# Pre-processing for the amenity flag columns (pool, cellar, garage), which in
# the loaded data appear as either True or a missing value:
#   1. a missing value is read as "amenity absent" and becomes 0;
#   2. the remaining True entries become 1.
# The sentinel of step 2 must be the boolean True: the string 'True' never
# compares equal to a boolean, which would turn the step into a silent no-op.
boolean_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('imputer1', SimpleImputer(missing_values=True, strategy='constant', fill_value=1))
])

In [12]:
# Imputation used for the 'area_m2' column: missing values are replaced by the
# column median computed when the pipeline is fitted.
area_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [13]:
# Imputation used for the 'ground_m2' column: a missing value is replaced by
# the constant 0.
ground_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
])

In [14]:
# Imputation used for the 'nb_room' column: a missing value is replaced by the
# constant 1.
room_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value=1, strategy='constant')),
])

In [15]:
# Imputation used for the 'nb_bedroom' column: a missing value is replaced by
# the constant 1.
bedroom_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value=1, strategy='constant')),
])

# Construction du pipeline de pré-traitement

In [16]:
# Route each group of columns to its own imputation pipeline. Columns that are
# not listed are discarded: remainder='drop' is the scikit-learn default and is
# spelled out here only to make that behaviour visible.
# NOTE(review): 'ref' and the one-hot encoded location columns are therefore
# not passed on to the regressor — confirm this is intended.
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('bool', boolean_pipe, boolean_columns),
        ('area', area_pipe, area_column),
        ('ground', ground_pipe, ground_column),
        ('room', room_pipe, nb_room_column),
        ('bedroom', bedroom_pipe, nb_bedroom_column),
    ],
    remainder='drop',
)

NameError: name 'categorical_pipe' is not defined

# Choix du modèle et intégration dans le pipeline

In [None]:
# Regressor used as the final pipeline step: ordinary least-squares linear
# regression with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# End-to-end estimator: column-wise pre-processing ('pp') followed by the
# regressor ('reg'), so fit/predict accept the un-imputed feature frame.
full_pipe = Pipeline([
    ('pp', preprocess_pipe),
    ('reg', model),
])

# Séparation des données en ensembles d'entraînement et de validation

In [None]:
# Hold out a validation set. No test_size is given, so scikit-learn's default
# proportion applies; random_state pins the shuffle so the split is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainfull,
    Y_trainfull,
    random_state=seed,
)

In [None]:
# One shape per line, in the order: train features, train target,
# validation features, validation target.
print(X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, sep='\n')

# Création du modèle

In [None]:
 # Fit the pre-processing steps and the regressor on the training split only.
 # The trailing semicolon hides the estimator repr in the notebook output.
 full_pipe.fit(X_train, Y_train);

In [None]:
# Distinct cities on each side of the split. X_train / X_val no longer carry a
# raw 'city' column once the categorical columns have been one-hot encoded, so
# the labels are looked up in the original frame through the row index that
# train_test_split preserves. unique() must be called: without the parentheses
# the names would hold the bound method instead of the values.
train_u = df_train_full.loc[X_train.index, 'city'].unique()
val_u = df_train_full.loc[X_val.index, 'city'].unique()

In [None]:
# Predictions of the fitted pipeline, first on the training split and then on
# the validation split.
y_train_pred, y_val_pred = (
    full_pipe.predict(features) for features in (X_train, X_val)
)

In [None]:
# Mean absolute error on each side of the split, rounded to zero decimals.
# A validation error far above the training error points to over-fitting.
mae_train, mae_val = (
    round(metrics.mean_absolute_error(actual, predicted), 0)
    for actual, predicted in ((Y_train, y_train_pred), (Y_val, y_val_pred))
)
print('MAE sur train : ', mae_train, '€')
print('MAE sur val : ', mae_val, '€')

In [None]:
# Persist the regressor object to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
# NOTE(review): only the final estimator is saved, not `full_pipe`; whoever
# loads "model.file" must apply the same pre-processing before predicting —
# confirm this is intended.
with open("model.file", "wb") as model_file:
    pickle.dump(model, model_file)