<div style="display:flex; justify-content: center; background-color: #8038DB">
<h1 style="margin:auto; padding: 25px;">
Procédure pour trouver un modele adapté
</h1>
</div>

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import learning_curve

from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
#import warnings
#warnings.simplefilter(action="ignore")

In [297]:
df1 = pd.read_csv("./data/train1.csv")

In [24]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0.1        16512 non-null  int64  
 1   Unnamed: 0          16512 non-null  float64
 2   longitude           16512 non-null  float64
 3   latitude            16512 non-null  float64
 4   housing_median_age  16512 non-null  float64
 5   total_rooms         16512 non-null  float64
 6   total_bedrooms      16512 non-null  float64
 7   population          16512 non-null  float64
 8   households          16512 non-null  float64
 9   median_income       16512 non-null  float64
 10  median_house_value  16512 non-null  float64
 11  blue_near           16512 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 1.5 MB


<div style="display:flex; justify-content: center; ">
<h2 style="margin:auto; padding: 20px; color: #8038DB">
Dummy Model
</h2>
</div>

In [25]:
y = df1["median_house_value"]
X = df1.drop("median_house_value", axis=1)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [27]:
dummy_reg = DummyRegressor(strategy="mean")

# fit the modem
dummy_reg.fit(X_train, y_train)

# calculate the score
dummy_reg.score(X_test, y_test)

-0.00036290962536944527

<div style="display:flex; justify-content: center; ">
<h2 style="margin:auto; padding: 20px; color: #8038DB">
Linear Regression
</h2>
</div>


In [29]:
y = df1["median_house_value"]
X = df1.drop("median_house_value", axis=1)

In [273]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1) 

In [274]:
linear_reg = LinearRegression()

linear_reg.fit(X_train, y_train)

# calculate the score
linear_reg.score(X_test, y_test)

0.6324436794975469

<div style="display:flex; justify-content: center; ">
<h2 style="margin:auto; padding: 20px; color: #8038DB">
Linear regression + Scale 
</h2>
</div>

In [289]:
# drop column 
df3= df1.drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1)

In [284]:
Y_train = df3["median_house_value"]
X_train = df3.drop(["median_house_value"],axis=1)

In [290]:
X_train_rl, X_test_rl, y_train_rl, y_test_rl = train_test_split(X_train, Y_train, test_size=0.3, random_state=1)

In [291]:
y_train_rl = y_train_rl.values.reshape(-1,1)
y_test_rl = y_test_rl.values.reshape(-1,1)

In [292]:
sc = StandardScaler()
X_train_rl = sc.fit_transform(X_train_rl)
X_test_rl = sc.fit_transform(X_test_rl)
y_train_rl = sc.fit_transform(y_train_rl)
y_test_rl = sc.fit_transform(y_test_rl)

In [293]:
lm = LinearRegression()
lm.fit(X_train_rl,y_train_rl)
lm.score(X_train_rl,y_train_rl)

0.6383224347152039

In [294]:
print("Intercept :",lm.intercept_)
print("Coefficients :",lm.coef_)
print("R² du modèle :",round(lm.score(X_train_rl,y_train_rl),2))

Intercept : [1.92898986e-15]
Coefficients : [[-0.74414253 -0.78843823  0.12221086 -0.13017075  0.27829932 -0.36496766
   0.25575564  0.66021315 -0.01088267]]
R² du modèle : 0.64


In [295]:
def metrics_timeseries(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    diff = y_true - y_pred
    mae = np.mean(abs(diff))
    mse = np.mean(diff**2)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs(diff / y_true)) * 100
    dict_metrics = {"Métrique":["MAE", "MSE", "RMSE", "MAPE"], "Résultats":[mae, mse, rmse, mape]}
    df_metrics = pd.DataFrame(dict_metrics)
    return df_metrics

In [296]:
metrics_rl = metrics_timeseries(y_test_rl, pred_rl)
metrics_rl

Unnamed: 0,Métrique,Résultats
0,MAE,0.44274
1,MSE,0.367435
2,RMSE,0.606165
3,MAPE,292.679153


<div style="display:flex; justify-content: center; ">
<h2 style="margin:auto; padding: 20px; color: #8038DB">
Decision Tree 
</h2>
</div>

In [278]:
# Target
y = df3["median_house_value"]

# X
X = df3.drop("median_house_value", axis=1)

# Split 
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state=1)
model = DecisionTreeRegressor(random_state=1)

# Fit Model
model.fit(train_X, train_y)
model

# validation predictions and mean absolute error
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 45,594


In [283]:
model.score(X,y)

0.9022093267270508