In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.cluster import KMeans
from sklearn.neighbors import BallTree
from catboost import CatBoostRegressor, Pool
import os

notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
csv_path = os.path.join(project_root, "Data", "xp.csv")
csv_path1 = os.path.join(project_root, "Data", "metro.csv")

df = pd.read_csv(csv_path)

metro_df = pd.read_csv(csv_path1)

metro_df.columns = metro_df.columns.str.strip()

In [43]:
df['date_mutation'] = pd.to_datetime(df['date_mutation'])

df['year'] = df['date_mutation'].dt.year
df['month'] = df['date_mutation'].dt.month
df['day_of_week'] = df['date_mutation'].dt.dayofweek
df['days_since_start'] = (df['date_mutation'] - df['date_mutation'].min()).dt.days

In [44]:
#Clusters
geo_features = df[['longitude', 'latitude']].copy()
n_clusters = 20

km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['geo_cluster'] = km.fit_predict(geo_features)

#Distance center
center_lon, center_lat = 2.3384444444444446, 48.86152777777778

df['dist_center'] =  np.sqrt((df['longitude'] - center_lon)**2 +
                             (df['latitude'] - center_lat) **2) * 111

In [45]:
metro_coords = np.radians(metro_df[['Latitude', 'Longitude']].values)
tree = BallTree(metro_coords, metric='haversine')

appart_coords = np.radians(df[['latitude', 'longitude']].values)

distances, indices = tree.query(appart_coords, k=1)
df['nearest_metro_dist_km'] = distances.flatten() * 6371

# Add nearest metro station info
df['nearest_metro_station'] = metro_df.iloc[indices.flatten()]['Libelle station'].values
df['nearest_metro_line'] = metro_df.iloc[indices.flatten()]['Libelle Line'].values

# Count metro stations within different radii
indices_300m = tree.query_radius(appart_coords, r=0.3/6371)
indices_500m = tree.query_radius(appart_coords, r=0.5/6371)

df['metro_count_300m'] = [len(idx) for idx in indices_300m]
df['metro_count_500m'] = [len(idx) for idx in indices_500m]
df['very_close_to_metro'] = (df['nearest_metro_dist_km'] < 0.1).astype(int)

In [46]:
df['surface_per_piece'] = df['surface_reelle_bati'] / df['nombre_pieces_principales'].replace(0, 1)
df['is_studio'] = (df['nombre_pieces_principales'] == 1).astype(int)
df['is_large'] = (df['nombre_pieces_principales'] >= 4).astype(int)

# Surface categories
df['surface_category'] = pd.cut(df['surface_reelle_bati'], 
                                  bins=[9, 40, 80, float('inf')],
                                  labels=['small', 'medium', 'large'])

In [47]:
# Sort by date
df_sorted = df.sort_values('date_mutation').reset_index(drop=True)

# Split: 80% train, 20% test
split_index = int(len(df_sorted) * 0.8)

train_df = df_sorted.iloc[:split_index].copy()
test_df = df_sorted.iloc[split_index:].copy()

In [48]:
station_stats_train = train_df.groupby('nearest_metro_station').agg({
    'surface_reelle_bati': ['mean', 'std', 'median', 'count'],
    'nombre_pieces_principales': ['mean', 'std', 'median'],
}).round(2)

# Flatten column names
station_stats_train.columns = [
    'station_avg_surface',
    'station_surface_std',
    'station_median_surface',
    'station_tx_count',
    'station_avg_rooms',
    'station_rooms_std',
    'station_median_rooms'
]

# Add derived features
station_stats_train['station_surface_range'] = (
    station_stats_train['station_avg_surface'] + 2*station_stats_train['station_surface_std'] -
    (station_stats_train['station_avg_surface'] - 2*station_stats_train['station_surface_std'])
)

# Replace zero std with 1 to avoid division by zero
station_stats_train['station_surface_std'] = station_stats_train['station_surface_std'].replace(0, 1)
station_stats_train['station_rooms_std'] = station_stats_train['station_rooms_std'].replace(0, 1)

In [49]:
# Merge to TRAIN (these are the same stations used to compute the stats)
train_df = train_df.merge(
    station_stats_train,
    left_on='nearest_metro_station',
    right_index=True,
    how='left'
)

# Merge to TEST (using the SAME stats from training!)
test_df = test_df.merge(
    station_stats_train,
    left_on='nearest_metro_station',
    right_index=True,
    how='left'
)

# Handle new stations in test that weren't in train
# Fill with overall training means
station_cols = [c for c in train_df.columns if c.startswith('station_')]

for col in station_cols:
    train_mean = train_df[col].mean()
    
    # Fill missing values in both train and test
    train_missing = train_df[col].isna().sum()
    test_missing = test_df[col].isna().sum()
    
    if train_missing > 0 or test_missing > 0:
        print(f"  {col}: filling {test_missing} missing values in test with train mean {train_mean:.2f}")
    
    train_df[col] = train_df[col].fillna(train_mean)
    test_df[col] = test_df[col].fillna(train_mean)


Handling missing values for new stations in test set...


In [50]:
for dataset_name, dataset in [('TRAIN', train_df), ('TEST', test_df)]:
    dataset['surface_vs_station_avg'] = (
        (dataset['surface_reelle_bati'] - dataset['station_avg_surface']) / 
        dataset['station_surface_std']
    )
    
    dataset['rooms_vs_station_avg'] = (
        (dataset['nombre_pieces_principales'] - dataset['station_avg_rooms']) / 
        dataset['station_rooms_std']
    )
    
    dataset['larger_than_station_median'] = (
        dataset['surface_reelle_bati'] > dataset['station_median_surface']
    ).astype(int)


In [51]:
train_agg = train_df.groupby(['code_postal', 'year', 'month']).agg({
    'price_per_sqrtm': 'median',
    'valeur_fonciere': 'median',
    'nombre_pieces_principales': 'median',
    'surface_reelle_bati': 'median',
    'surface_per_piece': 'median',
    'longitude': 'mean',
    'latitude': 'mean',
    'days_since_start': 'median',
    'geo_cluster': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0],
    'dist_center': 'median',
    'nearest_metro_dist_km': 'median',
    'station_tx_count': 'median',
    'station_avg_surface': 'median',
    'station_surface_std': 'median',
    'station_median_surface': 'median',
    'station_avg_rooms': 'median',
    'station_rooms_std': 'median',
    'station_median_rooms': 'median',
    'station_surface_range': 'median',
    'surface_vs_station_avg': 'median',
    'rooms_vs_station_avg': 'median',
    'larger_than_station_median': 'median',
    'metro_count_300m': 'median',
    'metro_count_500m': 'median',
    'very_close_to_metro': 'median',
    'is_studio': 'median',
    'is_large': 'median',
    'date_mutation': 'first'
}).reset_index()

print(f"Train aggregated shape: {train_agg.shape}")

Train aggregated shape: (837, 31)


In [52]:

test_agg = test_df.groupby(['code_postal', 'year', 'month']).agg({
    'price_per_sqrtm': 'median',
    'valeur_fonciere': 'median',
    'nombre_pieces_principales': 'median',
    'surface_reelle_bati': 'median',
    'surface_per_piece': 'median',
    'longitude': 'mean',
    'latitude': 'mean',
    'days_since_start': 'median',
    'geo_cluster': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0],
    'dist_center': 'median',
    'nearest_metro_dist_km': 'median',
    'station_tx_count': 'median',
    'station_avg_surface': 'median',
    'station_surface_std': 'median',
    'station_median_surface': 'median',
    'station_avg_rooms': 'median',
    'station_rooms_std': 'median',
    'station_median_rooms': 'median',
    'station_surface_range': 'median',
    'surface_vs_station_avg': 'median',
    'rooms_vs_station_avg': 'median',
    'larger_than_station_median': 'median',
    'metro_count_300m': 'median',
    'metro_count_500m': 'median',
    'very_close_to_metro': 'median',
    'is_studio': 'median',
    'is_large': 'median',
    'date_mutation': 'first'
}).reset_index()

In [53]:
# Count transactions per postal code in TRAINING period only
train_postal_counts = train_df.groupby('code_postal').size().to_dict()

# Apply to both train and test
train_agg['historical_tx_count'] = train_agg['code_postal'].map(train_postal_counts)
test_agg['historical_tx_count'] = test_agg['code_postal'].map(train_postal_counts).fillna(0)

In [54]:
# Define target and features
target = 'price_per_sqrtm'
drop_cols = ['price_per_sqrtm', 'valeur_fonciere', 'date_mutation']

# Split features and target
x_train = train_agg.drop(columns=drop_cols)
y_train = train_agg[target]

x_test = test_agg.drop(columns=drop_cols)
y_test = test_agg[target]

categorical_features = [
    'geo_cluster',
    'year',
    'month'
]

# Get categorical feature indices
cat_feature_indices = [x_train.columns.get_loc(cat) for cat in categorical_features if cat in x_train.columns]

In [55]:
baseline_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50
)

# Train the model
print("\nTraining baseline CatBoost model...")
baseline_model.fit(
    x_train,
    y_train,
    cat_features=cat_feature_indices,
    eval_set=(x_test, y_test),
    use_best_model=True
)

# Make predictions
y_train_pred_base = baseline_model.predict(x_train)
y_test_pred_base = baseline_model.predict(x_test)

# Calculate metrics
baseline_results = {
    'train_r2': r2_score(y_train, y_train_pred_base),
    'train_mae': mean_absolute_error(y_train, y_train_pred_base),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred_base)),
    'test_r2': r2_score(y_test, y_test_pred_base),
    'test_mae': mean_absolute_error(y_test, y_test_pred_base),
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred_base))
}

print(f"  R² Score: {baseline_results['train_r2']:.4f}")
print(f"  MAE: {baseline_results['train_mae']:.2f} €/m²")
print(f"  RMSE: {baseline_results['train_rmse']:.2f} €/m²")
print(f"\nTest Set:")
print(f"  R² Score: {baseline_results['test_r2']:.4f}")
print(f"  MAE: {baseline_results['test_mae']:.2f} €/m²")
print(f"  RMSE: {baseline_results['test_rmse']:.2f} €/m²")


Training baseline CatBoost model...
0:	learn: 1479.7489188	test: 1726.6284862	best: 1726.6284862 (0)	total: 46.9ms	remaining: 46.8s
100:	learn: 294.6180958	test: 534.6330248	best: 526.0159043 (71)	total: 5.5s	remaining: 48.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 526.0159043
bestIteration = 71

Shrink model to first 72 iterations.
  R² Score: 0.9553
  MAE: 246.67 €/m²
  RMSE: 339.42 €/m²

Test Set:
  R² Score: 0.9018
  MAE: 346.38 €/m²
  RMSE: 526.02 €/m²


In [56]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter grid for CatBoost
param_distributions = {
    'iterations': [1000],
    'learning_rate': [0.01],
    'depth': [6],
    'l2_leaf_reg': [3],
}

#iterations=1000,
#learning_rate=0.1,
#depth=6,
#l2_leaf_reg=3,
#random_seed=42,
#verbose=100,
#early_stopping_rounds=50



print("\nStarting hyperparameter tuning...")
print("This may take a while...\n")

# Create time series split
tscv = TimeSeriesSplit(n_splits=3)

# Create base model
base_catboost = CatBoostRegressor(
    random_seed=42,
    verbose=0,
    early_stopping_rounds=50
)

# Randomized search
random_search = RandomizedSearchCV(
    estimator=base_catboost,
    param_distributions=param_distributions,
    n_iter=20,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit (note: CatBoost in sklearn doesn't support cat_features in GridSearchCV)
# We'll need to convert categorical features to category dtype
x_train_cat = x_train.copy()
x_test_cat = x_test.copy()

for cat_feat in categorical_features:
    if cat_feat in x_train_cat.columns:
        x_train_cat[cat_feat] = x_train_cat[cat_feat].astype(str).astype('category')
        x_test_cat[cat_feat] = x_test_cat[cat_feat].astype(str).astype('category')

random_search.fit(x_train_cat, y_train, cat_features=categorical_features)


Starting hyperparameter tuning...
This may take a while...

Fitting 3 folds for each of 1 candidates, totalling 3 fits



The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.



In [57]:
best_params = random_search.best_params_

# Train optimized model
optimized_model = CatBoostRegressor(
    **best_params,
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50
)

print("\nTraining optimized CatBoost model with best parameters...")
optimized_model.fit(
    x_train,
    y_train,
    cat_features=cat_feature_indices,
    eval_set=(x_test, y_test),
    use_best_model=True
)

# Make predictions
y_train_pred_opt = optimized_model.predict(x_train)
y_test_pred_opt = optimized_model.predict(x_test)

# Calculate metrics
optimized_results = {
    'train_r2': r2_score(y_train, y_train_pred_opt),
    'train_mae': mean_absolute_error(y_train, y_train_pred_opt),
    'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred_opt)),
    'test_r2': r2_score(y_test, y_test_pred_opt),
    'test_mae': mean_absolute_error(y_test, y_test_pred_opt),
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred_opt))
}

print(f"  R² Score: {optimized_results['train_r2']:.4f}")
print(f"  MAE: {optimized_results['train_mae']:.2f} €/m²")
print(f"  RMSE: {optimized_results['train_rmse']:.2f} €/m²")
print(f"\nTest Set:")
print(f"  R² Score: {optimized_results['test_r2']:.4f}")
print(f"  MAE: {optimized_results['test_mae']:.2f} €/m²")
print(f"  RMSE: {optimized_results['test_rmse']:.2f} €/m²")


Training optimized CatBoost model with best parameters...
0:	learn: 1592.0896591	test: 1832.1204345	best: 1832.1204345 (0)	total: 84.3ms	remaining: 1m 24s
100:	learn: 832.2618628	test: 1065.2591066	best: 1065.2591066 (100)	total: 5.86s	remaining: 52.1s
200:	learn: 548.5249221	test: 731.7185142	best: 731.7185142 (200)	total: 11.6s	remaining: 46.2s
300:	learn: 442.2971159	test: 595.9189852	best: 595.9189852 (300)	total: 17.7s	remaining: 41s
400:	learn: 397.3623601	test: 546.0518985	best: 546.0518985 (400)	total: 23.7s	remaining: 35.4s
500:	learn: 373.0041045	test: 529.0939326	best: 529.0939326 (500)	total: 29.2s	remaining: 29.1s
600:	learn: 355.4193471	test: 521.7249000	best: 521.7218380 (599)	total: 35.4s	remaining: 23.5s
700:	learn: 340.7314219	test: 518.2007283	best: 518.1658543 (699)	total: 43.1s	remaining: 18.4s
800:	learn: 326.9460266	test: 515.9155580	best: 515.8890862 (799)	total: 50.4s	remaining: 12.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 515.447163

In [58]:
#Vis
results_df = pd.DataFrame({
    'actual_prix_m2': y_test,
    'predicted_prix_m2': y_test_pred_opt,
    'code_postal': x_test['code_postal'].values,
    'year': x_test['year'].values,
    'month': x_test['month'].values
})

#Vis pred vs actual
fig = px.scatter(results_df, x='actual_prix_m2', y='predicted_prix_m2',
                  hover_data=['code_postal', 'year', 'month'],
                  title='Predicted vs Actual Price per m²',
                  labels={
                       'predicted_prix_m2': 'Prix moyen €/m² prédit', 
                       'actual_prix_m2': 'Prix moyen €/m² réel'
                   })
fig.add_scatter(x=[y_test.min(), y_test.max()], 
                 y=[y_test.min(), y_test.max()],
                 mode='lines', name='Perfect Prediction', 
                 line=dict(dash='dash', color='red'))
fig.show()

In [59]:
feature_importance = pd.DataFrame({
    'feature': x_train.columns,
    'importance': optimized_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(feature_importance.head(20))

# Visualize feature importance
fig = px.bar(
    feature_importance.head(20),
    x='importance',
    y='feature',
    orientation='h',
    title='Top 20 Feature Importance (CatBoost)',
    labels={'importance': 'Importance Score', 'feature': 'Feature'}
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()


Top 20 Most Important Features:
                   feature  importance
0              code_postal   22.779343
28     historical_tx_count   22.149452
8         days_since_start    7.879650
7                 latitude    7.726464
12        station_tx_count    5.213933
19   station_surface_range    5.166766
14     station_surface_std    4.945656
6                longitude    3.334591
1                     year    2.971241
16       station_avg_rooms    2.607679
9              geo_cluster    2.543887
17       station_rooms_std    2.475043
13     station_avg_surface    2.261811
15  station_median_surface    1.736134
11   nearest_metro_dist_km    0.948837
24        metro_count_500m    0.931620
10             dist_center    0.882596
20  surface_vs_station_avg    0.590693
5        surface_per_piece    0.556054
21    rooms_vs_station_avg    0.485234


In [60]:
feature_importance_df = (
    feature_importance
    .sort_values(by="importance", ascending=False)
    .reset_index(drop=True)
)

feature_importance_df.head()

Unnamed: 0,feature,importance
0,code_postal,22.779343
1,historical_tx_count,22.149452
2,days_since_start,7.87965
3,latitude,7.726464
4,station_tx_count,5.213933


In [61]:
# Save model and preprocessing artifacts
import pickle

# Save the trained model
with open("../src/model_cat.pkl", "wb") as f:
    pickle.dump(optimized_model, f)

# Save preprocessing information needed for inference
preprocessing_info = {
    'kmeans_model': km,  # KMeans for geo clustering
    'center_coords': (2.3384444444444446, 48.86152777777778),  # Paris center
    'station_stats': station_stats_train,  # Station statistics from training
    'categorical_features': categorical_features,  # List of categorical features
    'cat_feature_indices': cat_feature_indices,  # Indices for CatBoost
    'feature_columns': list(x_train.columns),  # Column order
    'drop_cols': drop_cols  # Columns to drop
}

with open("../src/preprocessor_cat.pkl", "wb") as f:
    pickle.dump(preprocessing_info, f)

print("Model and preprocessor saved successfully!")
print(f"Preprocessing info keys: {list(preprocessing_info.keys())}")


Model and preprocessor saved successfully!
Preprocessing info keys: ['kmeans_model', 'center_coords', 'station_stats', 'categorical_features', 'cat_feature_indices', 'feature_columns', 'drop_cols']


In [62]:
import sklearn, pandas, numpy
print(sklearn.__version__, pandas.__version__, numpy.__version__)

1.6.1 2.2.3 2.1.3
