In [1]:
import os

from sqlalchemy import create_engine
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Database connection
# The connection URL (which embeds the user name and password) is read from the
# DATABASE_URL environment variable so that credentials are never stored in the
# notebook, e.g. DATABASE_URL=postgresql://<user>:<password>@localhost:5432/m_l_m
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    raise RuntimeError(
        "DATABASE_URL is not set. Export the PostgreSQL connection URL "
        "before running this notebook."
    )
engine = create_engine(db_url)

In [3]:
# Import data
# Read each table through the SQLAlchemy engine and keep the resulting frames
# in a dict keyed by table name.
tables = ['annonce', 'city', 'equipment', 'annonce_equipment']
dataframes = dict(
    (table_name, pd.read_sql_table(table_name, engine))
    for table_name in tables
)

In [33]:
# Extract tables for analysis
# Take copies: later cells add columns to these frames, and working on the
# objects stored in `dataframes` would silently modify the raw tables too.
ads = dataframes['annonce'].copy()
ads_equips = dataframes['annonce_equipment'].copy()

In [34]:
# Preprocessing
# Keep only ads whose price can be read as a number. Building the result as a
# chain ending in `.copy()` yields an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
ads = (
    ads[ads['price'] != 'PRIX NOT SPECIFIED']
    .assign(price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'))
    .dropna(subset=['price'])
    .copy()
)

# Parse the timestamp column once and derive the calendar features from it
ad_datetimes = pd.to_datetime(ads['datetime'])
ads['year'] = ad_datetimes.dt.year
ads['month'] = ad_datetimes.dt.month
ads['day'] = ad_datetimes.dt.day

# Encode categorical variables
ads = pd.get_dummies(ads, columns=['city_id'], drop_first=True)

# Create binary variables for equipment presence
# `assign` rebinds the name to a new frame instead of mutating the source table.
ads_equips = ads_equips.assign(equiped=1)
# `pivot` raises on repeated (annonce_id, equipment_id) pairs, so duplicates
# are removed first; the resulting matrix is unchanged when there are none.
equip_binary = (
    ads_equips
    .drop_duplicates(subset=['annonce_id', 'equipment_id'])
    .pivot(index='annonce_id', columns='equipment_id', values='equiped')
    .fillna(0)
)
ads = ads.merge(equip_binary, left_on='id', right_index=True, how='left')
# The left merge leaves NaN for ads that have no row in the equipment link
# table; such ads have no equipment, so their flags are set to 0.
ads[equip_binary.columns] = ads[equip_binary.columns].fillna(0)
equip_binary.head()

equipment_id,1,2,3,4,5,6,7,8,9,10,...,1362,1363,1364,1365,1366,1367,1368,1369,1370,1371
annonce_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Normalize numerical variables using Min-Max Scaling
numerical_vars = ['price', 'nb_rooms', 'nb_baths', 'surface_area']
scaler = MinMaxScaler()
# The scaler is fitted on, and applied to, all listed columns at once;
# note that this includes the regression target `price`.
scaled_values = scaler.fit_transform(ads[numerical_vars])
ads[numerical_vars] = scaled_values

In [None]:
# Correlation Analysis
# Pairwise correlations between the numerical variables
corr_matrix = ads[numerical_vars].corr()

# Visualize Correlation Matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Select numerical predictors for VIF
X = ads[['nb_rooms', 'nb_baths', 'surface_area', 'year', 'month']]  # Add more features if necessary

# variance_inflation_factor regresses each column on the remaining columns of
# the matrix exactly as given and does not add an intercept itself. A constant
# column is appended to the design matrix so each auxiliary regression has an
# intercept; without it the reported values are uncentered VIFs.
vif_design = X.assign(const=1.0).astype(float)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# `const` is the last column of the design matrix, so positions
# 0 .. X.shape[1] - 1 address the real predictors only.
vif_data["VIF"] = [variance_inflation_factor(vif_design.values, i) for i in range(X.shape[1])]
print(vif_data)

In [None]:
# Select independent variables (features) and target variable (price)
city_dummy_cols = list(ads.filter(like='city_id_'))
feature_cols = ['nb_rooms', 'nb_baths', 'surface_area'] + city_dummy_cols
X = ads[feature_cols]
y = ads['price']

# Convert column names to string and ensure no hidden issues
X.columns = X.columns.map(str)

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Ridge regression model
# (alpha controls the regularization strength and can be adjusted)
ridge_regressor = Ridge(alpha=1.0).fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_regressor.predict(X_test)

# Evaluate model performance
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# Visualize residuals against the true target values
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, residuals, color='#FF7200')
ax.axhline(y=0, color='#5C00FF', linestyle='--')
ax.set_xlabel('True Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals of Ridge Regression')
plt.show()

# Histogram of residuals
hist_ax = sns.histplot(residuals, kde=True, color='#00D4FF')
hist_ax.set_title('Distribution of Residuals')
plt.show()

# Q-Q plot for normality of residuals
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.show()


In [None]:
# Prepare data: room/bath counts, surface area and the city dummies
# (the 'year' feature is not included)
feature_cols = ['nb_rooms', 'nb_baths', 'surface_area', *ads.filter(like='city_id_').columns]
X = ads[feature_cols]
y = ads['price']

# Cast every column label to a plain str to avoid the 'quoted_name' and 'str' issue
X.columns = X.columns.map(str)

# Force every feature column through pd.to_numeric; values that cannot be
# converted become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to compare, keyed by display name
models = {
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=1.0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Individual handles to the same estimator objects, used by later cells
ridge_regressor = models["Ridge Regression"]
lasso_regressor = models["Lasso Regression"]
rf_regressor = models["Random Forest"]
gb_regressor = models["Gradient Boosting"]

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the caller's object is trained
    after this call.

    Returns:
        A ``(mse, r2)`` tuple computed from the predictions on ``X_test``.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, test_predictions),
        r2_score(y_test, test_predictions),
    )

# Evaluate each model and collect its test-set metrics by model name
results = {}
for model_name, model in models.items():
    mse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[model_name] = dict(MSE=mse, R2=r2)

# Print results, one line per model
for model_name, result in results.items():
    print(f"{model_name} - MSE: {result['MSE']:.4f}, R²: {result['R2']:.4f}")


In [None]:
# Define the models and their hyperparameters
# Fresh, unfitted estimators; this rebinds `models` from the comparison cell.
models = {
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# Define the parameter grids (keys match the names used in `models`)
linear_alphas = [0.1, 1, 10, 100]
tree_counts = [50, 100, 150]

param_grids = {
    "Ridge Regression": {"alpha": list(linear_alphas)},
    "Lasso Regression": {"alpha": list(linear_alphas)},
    "Random Forest": {
        "n_estimators": list(tree_counts),
        "max_depth": [5, 10, 15],
        "min_samples_split": [2, 5, 10],
    },
    "Gradient Boosting": {
        "n_estimators": list(tree_counts),
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },
}

# Function to perform GridSearchCV and print the best results
def tune_model(model_name, model, param_grid, X_train, y_train, cv=5, n_jobs=-1):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    Args:
        model_name: Display name used in the progress and result messages.
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter names to candidate values.
        X_train: Training features.
        y_train: Training target.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 5, as before).
        n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``
            (default -1, as before).

    Returns:
        ``grid_search.best_estimator_`` for the best parameter combination.
    """
    print(f"Tuning {model_name}...")
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    # The scorer is the negated MSE, so flip the sign to report a plain MSE.
    print(f"Best MSE for {model_name}: {-grid_search.best_score_}")

    return grid_search.best_estimator_

# Tuning the models: run the grid search for every entry of `models`
best_models = {}
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    best_model = tune_model(model_name, model, param_grid, X_train, y_train)
    best_models[model_name] = best_model

# After tuning, evaluate the best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f"{model_name} - MSE: {mse:.4f}, R²: {r2:.4f}")


In [None]:
# Predict using the Random Forest model
# (`rf_regressor` must already be fitted by the model comparison cell)
y_pred_rf = rf_regressor.predict(X_test)

# Calculate residuals
residuals_rf = y_test - y_pred_rf

# Plot Residuals vs True Values (Scatter Plot)
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, residuals_rf, color='#00FFFF')
ax.axhline(y=0, color='#2B00FF', linestyle='--')
ax.set_xlabel('True Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals of Random Forest')
plt.show()

# Plot Histogram of Residuals
hist_ax = sns.histplot(residuals_rf, kde=True, color='#FF0067')
hist_ax.set_title('Distribution of Residuals')
plt.show()

# Q-Q Plot to check Normality of Residuals
stats.probplot(residuals_rf, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.show()

In [None]:
# Predict on the test set using Gradient Boosting
# (`gb_regressor` must already be fitted by the model comparison cell)
y_pred_gb = gb_regressor.predict(X_test)

# Calculate residuals
residuals_gb = y_test - y_pred_gb

# Scatter plot of residuals against the true target values
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, residuals_gb, color='#7200FF')
ax.axhline(y=0, color='#0019FF', linestyle='--')
ax.set_xlabel('True Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals of Gradient Boosting')
plt.show()

# Histogram of residuals for Gradient Boosting
hist_ax = sns.histplot(residuals_gb, kde=True, color='springgreen')
hist_ax.set_title('Distribution of Residuals - Gradient Boosting')
plt.show()

# Q-Q plot for normality of residuals (Gradient Boosting)
stats.probplot(residuals_gb, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals - Gradient Boosting')
plt.show()

## Model Selection: Why Random Forest?

After evaluating multiple models (Ridge, Lasso, Random Forest, and Gradient Boosting), **Random Forest** was chosen based on its superior performance and robustness.

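- **Performance:** Random Forest achieved the _**lowest MSE (0.0013)**_ and the _**highest R² (0.9418)**_, outperforming the other models. Because `price` is min-max scaled before modelling, the MSE is expressed in scaled units, not in currency.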
- **Non-linearity:** Random Forest captures complex, non-linear relationships, which is beneficial for this dataset.
- **Robustness:** It is less prone to overfitting and handles noisy data well.
- **Feature Importance:** Random Forest helps identify key predictors, improving model interpretability.
- **Residuals:** The residuals were well-distributed around zero, indicating good generalization.

For these reasons, **Random Forest** was selected as the final model.

In [None]:
# Get feature importances from the fitted Random Forest
feature_importances = rf_regressor.feature_importances_

# Create a DataFrame for feature importances, sorted by importance (descending)
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Plot the feature importances as a horizontal bar chart
fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='#7D00FF')
ax.set_title('Random Forest Feature Importances')
ax.set_xlabel('Importance')
plt.show()

## Feature Importance Interpretation

In the Random Forest model, the feature importance ranking shows that **'nb_rooms'** has the greatest influence on property price, followed by **'nb_baths'** and **'surface_area'**. This indicates:

- **Number of rooms** (*`'nb_rooms'`*) is the most significant factor in predicting price, with more rooms generally leading to higher property values.
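- **Number of bathrooms** (*`'nb_baths'`*) and **surface area** (*`'surface_area'`*) have lesser importance but still contribute to price, with larger properties and more bathrooms being valued higher, though to a smaller extent than the number of rooms.
- **Caveat:** Random Forest importances measure how much each feature contributes to the model's splits, not the direction of its effect. The directional statements above should therefore be confirmed separately (for example with partial-dependence plots or the correlation matrix). The city dummy variables are also part of the model and appear in the same ranking.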

In [None]:
# seaborn's residplot regresses y on x and draws the residuals of that fit,
# so the vertical axis shows residuals of the predicted price (regressed on
# the true price), not the predicted price itself.
sns.residplot(x=y_test, y=y_pred_rf, lowess=True, color='cyan')
plt.title("Regression Residual Plot")
plt.xlabel("True Price")
plt.ylabel("Residual of Predicted Price")
plt.show()

In [None]:
# Classification Analysis
# NOTE(review): the label is 1 whenever the row sum over *all* equipment
# columns is positive, i.e. the ad has at least one equipment of any kind.
# Nothing here selects a specific "elevator" equipment id - confirm that this
# is the intended target.
equipment_count = ads[equip_binary.columns].sum(axis=1)
ads['has_elevator'] = equipment_count.gt(0).astype('int64')

X_clf = ads[['nb_rooms', 'nb_baths', 'surface_area']]
y_clf = ads['has_elevator']
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Fit the classifier on the training split and predict labels for the test split
model_clf = RandomForestClassifier(random_state=42)
model_clf.fit(X_train_clf, y_train_clf)
y_pred_clf = model_clf.predict(X_test_clf)

In [None]:
# Evaluate Classification
print(classification_report(y_test_clf, y_pred_clf))

# Second column of predict_proba, used as the score for ROC-AUC
positive_class_proba = model_clf.predict_proba(X_test_clf)[:, 1]
print("Classification ROC-AUC:", roc_auc_score(y_test_clf, positive_class_proba))

In [None]:
# Score the test split once and reuse the scores for both the curve and the AUC
roc_scores = model_clf.predict_proba(X_test_clf)[:, 1]
fpr, tpr, _ = roc_curve(y_test_clf, roc_scores)
roc_auc = roc_auc_score(y_test_clf, roc_scores)

plt.plot(fpr, tpr, label=f"Random Forest (AUC = {roc_auc:.2f})")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

## 2. Ethical Considerations

### 2.1. Bias in Data
- **Urban vs. Rural Bias**: If the dataset contains more advertisements from urban areas, the models may not generalize well to rural contexts.
- **Socioeconomic Factors**: Price predictions might inadvertently reflect societal inequalities (e.g., lower prices in certain neighborhoods due to historical biases).

#### Mitigation:
- Perform stratified sampling to ensure diverse representation in training data.
- Include additional variables like average income or amenities to account for contextual factors.

### 2.2. Transparency in Predictions
- **Explainable AI**: Use tools like SHAP or LIME to explain model predictions, ensuring end-users understand why a particular prediction was made.
- **Fairness in Classification**: Ensure the classification model does not disproportionately misclassify equipment presence in specific regions or demographics.

#### Mitigation:
- Perform fairness audits to evaluate disparate impact across groups.
- Regularly monitor model performance in deployment to detect biases.

### 2.3. Privacy Concerns
- Ensure the dataset does not include personally identifiable information (PII) about advertisers or users.
- If using geolocation data (e.g., city IDs), ensure it is anonymized.

#### Mitigation:
- Use pseudonymized or aggregated data when analyzing or deploying models.
- Comply with data protection regulations like GDPR or CCPA.

### 2.4. Ethical Use of Predictions
- Predicted prices could influence the real estate market (e.g., price hikes). Ensure stakeholders use predictions responsibly.
- Avoid using models for discriminatory practices, such as excluding certain demographics from targeted advertisements.

#### Mitigation:
- Clearly define and document acceptable use cases for the models.
- Collaborate with domain experts to align model outcomes with ethical standards.

---

## 3. Empirical Steps for Ethical Monitoring
1. **Bias Detection**:
   - Calculate fairness metrics like disparate impact ratio to evaluate bias.
   - Compare model performance across different subgroups (e.g., urban vs. rural).

2. **Explainability Tools**:
   - Implement SHAP for feature importance analysis:
     ```python
     import shap
     explainer = shap.Explainer(rf_regressor, X_test)  # the selected Random Forest model
     shap_values = explainer(X_test)
     shap.summary_plot(shap_values, X_test)
     ```

3. **Regular Audits**:
   - Set up a pipeline for periodic evaluation of deployed models to detect drift or new biases.

---

### Final Note:
By documenting the project comprehensively and addressing ethical considerations thoroughly, we ensure that the models are both robust and responsibly deployed. This approach fosters transparency, trust, and fairness in the system.