# Notebook modèle

In [7]:
#!pip install geopy
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from geopy.distance import geodesic
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet,Lasso
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler,PowerTransformer,Normalizer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline
import joblib

# Load the housing table, drop the 'Unnamed: 0' column and discard rows whose
# 'total_bedrooms' value is missing.
df = (
    pd.read_csv('housing.csv')
    .drop(columns='Unnamed: 0')
    .dropna(subset=['total_bedrooms'])
)

# Index labels of the rows reporting more households than inhabitants; those rows are removed.
more_households_than_people = df.index[df['households'] > df['population']]

# Replace the categorical 'ocean_proximity' column by a 0/1 'is_inland' indicator
# (1 only for the 'INLAND' category).
df = (
    df.drop(index=more_households_than_people)
    .assign(is_inland=lambda frame: frame['ocean_proximity'].eq('INLAND').astype(int))
    .drop(columns='ocean_proximity')
)

# df_clean is a second name for the same DataFrame object, not a copy:
# anything that mutates df_clean afterwards also mutates df.
df_clean = df
def distance_to_location(row, location_center):
    """Return the geodesic distance, in kilometres, between a row and a reference point.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row['latitude']`` and ``row['longitude']``
        (for example one row of a DataFrame).
    location_center : tuple
        Reference point, passed as the first argument to ``geodesic``;
        the callers in this notebook pass ``(latitude, longitude)`` pairs.

    Returns
    -------
    The ``kilometers`` attribute of the ``geodesic`` result.
    """
    return geodesic(location_center, (row['latitude'], row['longitude'])).kilometers

def calculate_distances(df):
    """Add geodesic distance-to-city columns to ``df`` in place.

    For every row, ``distance_to_location`` is called with the row's
    ``latitude`` / ``longitude`` and each of four fixed reference points; the
    results (kilometres) are stored in the columns ``distance_to_sf``,
    ``distance_to_la``, ``distance_to_sd`` and ``distance_to_lb``, created in
    that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns. Modified in place.

    Returns
    -------
    None
    """
    # (latitude, longitude) of each reference point, keyed by the output column.
    # NOTE(review): the suffixes presumably stand for San Francisco, Los Angeles,
    # San Diego and Long Beach -- confirm.
    city_centers = {
        'distance_to_sf': (37.7749, -122.4194),
        'distance_to_la': (34.0522, -118.2437),
        'distance_to_sd': (32.7157, -117.1611),
        'distance_to_lb': (33.7701, -118.1937),
    }
    # NOTE(review): an unused `ana_center = (33.8366, -117.9143)` used to be
    # defined here without any matching column; it was removed as dead code.
    # Add a 'distance_to_ana' entry above if that feature was intended.

    # Only the two coordinate columns are needed by distance_to_location, so
    # the row-wise apply runs on this narrow view instead of rebuilding
    # full-width rows (including the freshly added distance columns) each time.
    coordinates = df[['latitude', 'longitude']]
    for column, center in city_centers.items():
        df[column] = coordinates.apply(distance_to_location, args=(center,), axis=1)

# Add the distance_to_* columns to df_clean (the frame is modified in place).
calculate_distances(df_clean)

# Distance cut-offs, one per reference city, in the same unit as the
# distance_to_* columns (kilometres).
seuil_sf, seuil_lb, seuil_la, seuil_sd = 100, 200, 200, 300

# A row is flagged when it lies strictly within the cut-off of at least one city.
near_any_city = (
    df_clean['distance_to_sf'].lt(seuil_sf)
    | df_clean['distance_to_lb'].lt(seuil_lb)
    | df_clean['distance_to_la'].lt(seuil_la)
    | df_clean['distance_to_sd'].lt(seuil_sd)
)

# 'centre_ville' is 1 for flagged rows and 0 for all others.
df_clean['centre_ville'] = near_any_city.astype('int64')



In [21]:

# Separate the predictors from the regression target.
X = df_clean.drop('median_house_value', axis=1)
y = df_clean['median_house_value']

# Hold out 20% of the rows for the final evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

selected_features = ['distance_to_sf','distance_to_la','distance_to_sd','housing_median_age','latitude','longitude','centre_ville','population', 'median_income', 'is_inland','total_bedrooms','total_rooms','households']

X_train = X_train[selected_features]
X_test = X_test[selected_features]

# The scaler is the first step of the pipeline rather than a separate
# pre-processing pass. This way
#   * during cross-validation it is re-fit on the training part of each fold
#     only, so the validation part never leaks into the scaling statistics;
#   * the exported model embeds its own scaler and accepts raw feature values.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('select_features', SelectKBest(score_func=f_regression)),
    ('knn', KNeighborsRegressor())
])

# Grid of hyperparameters to search.
param_grid = {
    'select_features__k': [7, 8,10,12,13],
    'knn__n_neighbors': [5, 7, 9, 12,15,20],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# scikit-learn maximises scores, hence the negated MSE.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Meilleurs hyperparamètres trouvés:", grid_search.best_params_)

# Best pipeline, refit by GridSearchCV on the whole training set.
best_model = grid_search.best_estimator_

# Kept so that later cells relying on these names keep working: the fitted
# scaler of the best pipeline and the scaled feature matrices it produces.
scaler = best_model.named_steps['scaler']
X_train_scaled_standard = scaler.transform(X_train)
X_test_scaled_standard = scaler.transform(X_test)

# Evaluate on the held-out rows; the pipeline scales the raw features itself.
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print("Résultats avec le meilleur modèle:")
print("RMSE:", rmse_best)
print("Coefficient de détermination (R^2) :", r2_best)

# Export the best pipeline (scaler + feature selection + KNN) with joblib.
# NOTE(review): the saved model now expects *unscaled* features in the
# `selected_features` order -- confirm that any consumer of the file does not
# apply its own scaling first.
joblib.dump(best_model, 'meilleur_modele.joblib')



Meilleurs hyperparamètres trouvés: {'knn__metric': 'manhattan', 'knn__n_neighbors': 9, 'knn__weights': 'distance', 'select_features__k': 13}
Résultats avec le meilleur modèle:
RMSE: 55429.49943891554
Coefficient de détermination (R^2) : 0.7733046882717974
