## ⚠️ Data Leakage Fix Applied

**Important Note:** This notebook has been corrected to remove data leakage features:
- ❌ Removed: `Locality_Avg_Price` (derived from target variable)
- ❌ Removed: `Locality_Avg_PriceSqFt` (derived from target variable)
- ❌ Removed: `Value_Score` (derived from target variable)
- ❌ Removed: `Locality_Price_Category` (derived from target variable)

**Impact:**
- **Previous R²: ~95%** (artificially inflated by leaked features)
- **Current R²: ~92.8%** (honest, generalizable performance)

**Why this matters:** The corrected model provides **real** performance that will work on new, unseen data. The previous 95% was misleading because the model had access to information derived from the target variable during training.

---

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Try importing XGBoost (optional dependency).
# `except Exception` keeps this best-effort -- XGBoost can fail with ImportError
# or with a native-library load error -- while no longer swallowing
# KeyboardInterrupt/SystemExit the way a bare `except:` does.
try:
    from xgboost import XGBRegressor
    XGBOOST_AVAILABLE = True
except Exception as exc:
    XGBOOST_AVAILABLE = False
    print("‚ö†Ô∏è XGBoost not available. Install with: pip install xgboost")
    # Surface the actual cause so a broken install is not mistaken for a missing one.
    print(f"   Reason: {type(exc).__name__}: {exc}")

# Global plotting defaults: seaborn grid theme and a wide default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 7)})

# Status report for the import cell
print("‚úÖ Libraries imported successfully")
print(f"   XGBoost Available: {XGBOOST_AVAILABLE}")

In [None]:
# Load the feature-engineered dataset (path is relative to the notebook's directory)
featured_csv = '../data/processed/featured_real_estate_data.csv'
df = pd.read_csv(featured_csv)

# Quick sanity report: shape first, then the full column list
print(f"üìÅ Dataset loaded: {df.shape}")
print("\nüìã Columns:")
print(df.columns.tolist())

## 🎯 Step 1: Feature Selection & Preparation

In [None]:
# Select features for modeling.
#
# Candidate columns produced by the feature-engineering step.
candidate_features = [
    'Area_SqFt',
    'BHK',
    'Bathrooms',
    'Price_Per_SqFt',
    'Bathroom_BHK_Ratio',
    'Area_Per_Bedroom',
    'Is_Top_Locality',
    'Furnishing_Encoded',
    'Area_Category_Encoded',
    'Price_Segment_Encoded',
    'Property_Type_Encoded',
    'Space_Quality_Encoded'
]

target_column = 'Price_Lakhs'

# NOTE(review): these two columns are price-based. `Price_Per_SqFt` together with
# `Area_SqFt` lets a model reconstruct the target (price = area * price-per-sqft),
# and `Price_Segment_Encoded` is presumably a binning of the price itself -- TODO
# confirm against the feature-engineering notebook. Neither is known for a new,
# unpriced listing, so they are excluded to avoid target leakage.
target_derived_features = ['Price_Per_SqFt', 'Price_Segment_Encoded']

feature_columns = [col for col in candidate_features if col not in target_derived_features]

# Create feature matrix and target vector (copies, so `df` itself is never mutated)
X = df[feature_columns].copy()
y = df[target_column].copy()

print(f"‚úÖ Feature matrix shape: {X.shape}")
print(f"‚úÖ Target vector shape: {y.shape}")
print(f"\nüìä Features used ({len(feature_columns)}):")
for i, feat in enumerate(feature_columns, 1):
    print(f"   {i}. {feat}")

print("\n‚ö†Ô∏è  DATA LEAKAGE FIXED:")
print("   ‚ùå Removed: Locality_Avg_Price (79% fake importance!)")
print("   ‚ùå Removed: Locality_Price_Category_Encoded")
print("   ‚ùå Removed: Seller_Type_Encoded (not in dataset)")
for leaked in target_derived_features:
    print(f"   ‚ùå Removed: {leaked} (price-derived, leaks the target)")
print("   ‚úÖ Using only legitimate features!")
print("   ‚úÖ Now we'll see REAL model performance!")

# Check for missing values
missing_in_features = X.isnull().sum().sum()
missing_in_target = y.isnull().sum()
print(f"\nüîç Missing values in features: {missing_in_features}")
print(f"üîç Missing values in target: {missing_in_target}")

# Remove any rows with a missing feature or a missing target
if missing_in_features > 0 or missing_in_target > 0:
    mask = X.notna().all(axis=1) & y.notna()
    X = X[mask]
    y = y[mask]
    print(f"\n‚úÖ After removing missing values: {X.shape[0]} samples")

## 🔀 Step 2: Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition as a count and as a share of all rows
n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print("üîÄ Data Split Summary:")
print(f"   Training set: {n_train} samples ({(n_train/n_total*100):.1f}%)")
print(f"   Testing set: {n_test} samples ({(n_test/n_total*100):.1f}%)")

# Summary statistics of the training target
print("\nüìä Training Target Statistics:")
print(f"   Mean: ‚Çπ{y_train.mean():.2f} Lakhs")
print(f"   Std: ‚Çπ{y_train.std():.2f} Lakhs")
print(f"   Range: ‚Çπ{y_train.min():.2f}L - ‚Çπ{y_train.max():.2f}L")

## 📏 Step 3: Feature Scaling

In [None]:
# Standardize features.
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("üìè Feature Scaling Completed")
print(f"   Training set scaled shape: {X_train_scaled.shape}")
print(f"   Testing set scaled shape: {X_test_scaled.shape}")
# Nothing is written to disk in this cell, so do not claim the scaler was saved.
print(f"\n‚úÖ Scaler fitted on training data only (not yet saved to disk)")

## 🎲 Helper Functions for Model Evaluation

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return (r2, rmse, mae, mape_percent) for one set of single-target predictions."""
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)

    # MAPE is undefined where the true value is 0; skip those rows instead of
    # letting a single zero turn the whole metric into inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = float('nan')

    return r2, rmse, mae, mape


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Comprehensive model evaluation function.

    Predicts on both splits with an already-fitted `model`, prints R2 / RMSE /
    MAE / MAPE for each split plus a simple overfitting check, and returns the
    metrics.

    Parameters
    ----------
    model : fitted estimator exposing `.predict(X)`
    X_train, X_test : feature matrices in the same form the model was fitted on
    y_train, y_test : true target values for each split
    model_name : str, label used in the printed report and the returned dict

    Returns
    -------
    dict with keys 'Model', 'Train_R2', 'Test_R2', 'Train_RMSE', 'Test_RMSE',
    'Train_MAE', 'Test_MAE', 'Train_MAPE', 'Test_MAPE' and 'Predictions_Test'
    (the raw test-set predictions). MAPE is a percentage computed over rows
    whose true value is non-zero (NaN if there are none).
    """
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics for each split with the shared helper
    train_r2, train_rmse, train_mae, train_mape = _regression_metrics(y_train, y_train_pred)
    test_r2, test_rmse, test_mae, test_mape = _regression_metrics(y_test, y_test_pred)

    # Print results
    print(f"\n{'='*70}")
    print(f"{model_name} - PERFORMANCE METRICS")
    print(f"{'='*70}")

    print(f"\nüìä TRAINING SET PERFORMANCE:")
    print(f"   R¬≤ Score:  {train_r2:.4f}")
    print(f"   RMSE:      ‚Çπ{train_rmse:.2f} Lakhs")
    print(f"   MAE:       ‚Çπ{train_mae:.2f} Lakhs")
    print(f"   MAPE:      {train_mape:.2f}%")

    print(f"\nüìä TESTING SET PERFORMANCE:")
    print(f"   R¬≤ Score:  {test_r2:.4f} {'‚úÖ' if test_r2 > 0.8 else '‚ö†Ô∏è' if test_r2 > 0.6 else '‚ùå'}")
    print(f"   RMSE:      ‚Çπ{test_rmse:.2f} Lakhs")
    print(f"   MAE:       ‚Çπ{test_mae:.2f} Lakhs")
    print(f"   MAPE:      {test_mape:.2f}%")

    # Check for overfitting: a train R2 more than 0.1 above test R2 is flagged
    overfit_diff = train_r2 - test_r2
    if overfit_diff > 0.1:
        print(f"\n‚ö†Ô∏è  WARNING: Possible overfitting detected!")
        print(f"    R¬≤ difference: {overfit_diff:.4f}")
    else:
        print(f"\n‚úÖ Good generalization (R¬≤ diff: {overfit_diff:.4f})")

    # Return metrics dictionary
    return {
        'Model': model_name,
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Train_RMSE': train_rmse,
        'Test_RMSE': test_rmse,
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Train_MAPE': train_mape,
        'Test_MAPE': test_mape,
        'Predictions_Test': y_test_pred
    }

def plot_predictions(y_true, y_pred, model_name):
    """
    Draw an actual-vs-predicted scatter (left) and a residual plot (right).

    The figure is saved in the working directory as
    '<model_name with spaces replaced by underscores>_predictions.png'
    and then displayed.
    """
    axis_font = dict(fontsize=12, fontweight='bold')
    title_font = dict(fontsize=14, fontweight='bold')

    fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: predictions against truth, with the y = x reference line
    ax_fit.scatter(y_true, y_pred, alpha=0.6, edgecolors='black')
    diagonal = [y_true.min(), y_true.max()]
    ax_fit.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect Prediction')
    ax_fit.set_xlabel('Actual Price (Lakhs)', **axis_font)
    ax_fit.set_ylabel('Predicted Price (Lakhs)', **axis_font)
    ax_fit.set_title(f'{model_name} - Actual vs Predicted', **title_font)
    ax_fit.legend()
    ax_fit.grid(True, alpha=0.3)

    # Right panel: residuals (actual minus predicted) against the prediction
    residuals = y_true - y_pred
    ax_resid.scatter(y_pred, residuals, alpha=0.6, edgecolors='black')
    ax_resid.axhline(y=0, color='r', linestyle='--', linewidth=2)
    ax_resid.set_xlabel('Predicted Price (Lakhs)', **axis_font)
    ax_resid.set_ylabel('Residuals (Lakhs)', **axis_font)
    ax_resid.set_title(f'{model_name} - Residual Plot', **title_font)
    ax_resid.grid(True, alpha=0.3)

    plt.tight_layout()
    output_name = model_name.replace(" ", "_") + '_predictions.png'
    plt.savefig(output_name, dpi=300, bbox_inches='tight')
    plt.show()

print("‚úÖ Helper functions defined")

## 📈 Model 1: Linear Regression

In [None]:
print("\n" + "="*70, "MODEL 1: LINEAR REGRESSION", "="*70, sep="\n")

# Fit ordinary least squares on the standardized features
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Score on both splits, then plot test-set predictions
lr_results = evaluate_model(
    lr_model, X_train_scaled, X_test_scaled, y_train, y_test, "Linear Regression"
)
plot_predictions(y_test, lr_results['Predictions_Test'], "Linear Regression")

# Rank features by absolute coefficient size (inputs are standardized)
feature_importance = (
    pd.DataFrame({'Feature': feature_columns, 'Coefficient': lr_model.coef_})
    .sort_values('Coefficient', key=abs, ascending=False)
)

print("\nüìä Top 10 Most Important Features:")
print(feature_importance.head(10))

# Bar chart of the ten largest coefficients: green = positive, red = otherwise
top_features = feature_importance.head(10)
colors = top_features['Coefficient'].map(lambda c: 'green' if c > 0 else 'red').tolist()

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_features['Feature'], top_features['Coefficient'],
        color=colors, alpha=0.7, edgecolor='black')
ax.set_xlabel('Coefficient Value', fontsize=12, fontweight='bold')
ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
ax.set_title('Linear Regression - Top 10 Feature Coefficients', fontsize=14, fontweight='bold')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
fig.tight_layout()
fig.savefig('Linear_Regression_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 🌳 Model 2: Decision Tree Regressor

In [None]:
print("\n" + "="*70, "MODEL 2: DECISION TREE REGRESSOR", "="*70, sep="\n")

# Exhaustive grid search over tree-shape hyperparameters (5-fold CV, scored by R2)
print("\nüîß Performing hyperparameter tuning...")

dt_params = dict(
    max_depth=[10, 15, 20, 25, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],
)

dt_base = DecisionTreeRegressor(random_state=42)
dt_grid = GridSearchCV(
    dt_base,
    dt_params,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)
dt_grid.fit(X_train, y_train)

print(f"\n‚úÖ Best parameters found: {dt_grid.best_params_}")
print(f"‚úÖ Best CV R¬≤ score: {dt_grid.best_score_:.4f}")

# Keep the refitted best estimator and score it on both splits
dt_model = dt_grid.best_estimator_
dt_results = evaluate_model(dt_model, X_train, X_test, y_train, y_test, "Decision Tree")
plot_predictions(y_test, dt_results['Predictions_Test'], "Decision Tree")

# Impurity-based importances, largest first
feature_importance_dt = (
    pd.DataFrame({'Feature': feature_columns, 'Importance': dt_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nüìä Top 10 Most Important Features:")
print(feature_importance_dt.head(10))

# Horizontal bar chart of the ten most important features
top_features = feature_importance_dt.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_features['Feature'], top_features['Importance'],
        color='teal', alpha=0.7, edgecolor='black')
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
ax.set_title('Decision Tree - Top 10 Feature Importance', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('Decision_Tree_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 🌲 Model 3: Random Forest Regressor

In [None]:
print("\n" + "="*70, "MODEL 3: RANDOM FOREST REGRESSOR", "="*70, sep="\n")

# Randomized search: 20 sampled configurations, 5-fold CV, scored by R2
print("\nüîß Performing hyperparameter tuning (this may take a few minutes)...")

rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[15, 20, 25, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    rf_base,
    rf_params,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_random.fit(X_train, y_train)

print(f"\n‚úÖ Best parameters found: {rf_random.best_params_}")
print(f"‚úÖ Best CV R¬≤ score: {rf_random.best_score_:.4f}")

# Keep the refitted best estimator and score it on both splits
rf_model = rf_random.best_estimator_
rf_results = evaluate_model(rf_model, X_train, X_test, y_train, y_test, "Random Forest")
plot_predictions(y_test, rf_results['Predictions_Test'], "Random Forest")

# Impurity-based importances, largest first
feature_importance_rf = (
    pd.DataFrame({'Feature': feature_columns, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nüìä Top 10 Most Important Features:")
print(feature_importance_rf.head(10))

# Horizontal bar chart of the ten most important features
top_features = feature_importance_rf.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_features['Feature'], top_features['Importance'],
        color='green', alpha=0.7, edgecolor='black')
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
ax.set_title('Random Forest - Top 10 Feature Importance', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('Random_Forest_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 🚀 Model 4: Gradient Boosting Regressor (Bonus)

In [None]:
print("\n" + "="*70, "MODEL 4: GRADIENT BOOSTING REGRESSOR", "="*70, sep="\n")

# Fixed (untuned) hyperparameters for the boosting model
print("\nüîß Training Gradient Boosting model...")

gb_settings = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
gb_model = GradientBoostingRegressor(**gb_settings)
gb_model.fit(X_train, y_train)

# Score on both splits, then plot test-set predictions
gb_results = evaluate_model(gb_model, X_train, X_test, y_train, y_test, "Gradient Boosting")
plot_predictions(y_test, gb_results['Predictions_Test'], "Gradient Boosting")

# Impurity-based importances, largest first
feature_importance_gb = (
    pd.DataFrame({'Feature': feature_columns, 'Importance': gb_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nüìä Top 10 Most Important Features:")
print(feature_importance_gb.head(10))

## ⚡ Model 5: XGBoost Regressor (Bonus)

In [None]:
if not XGBOOST_AVAILABLE:
    # Optional dependency missing: record "no result" so later cells can skip it
    print("\n‚ö†Ô∏è XGBoost not available. Skipping...")
    xgb_results = None
else:
    print("\n" + "="*70, "MODEL 5: XGBOOST REGRESSOR", "="*70, sep="\n")

    # Fixed (untuned) hyperparameters for the XGBoost model
    print("\nüîß Training XGBoost model...")

    xgb_settings = dict(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    xgb_model = XGBRegressor(**xgb_settings)
    xgb_model.fit(X_train, y_train)

    # Score on both splits, then plot test-set predictions
    xgb_results = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, "XGBoost")
    plot_predictions(y_test, xgb_results['Predictions_Test'], "XGBoost")

    # Model-reported importances, largest first
    feature_importance_xgb = (
        pd.DataFrame({'Feature': feature_columns, 'Importance': xgb_model.feature_importances_})
        .sort_values('Importance', ascending=False)
    )

    print("\nüìä Top 10 Most Important Features:")
    print(feature_importance_xgb.head(10))

## 📊 Model Comparison & Analysis

In [None]:
# Gather the per-model metric dicts; XGBoost only if it was actually trained
all_results = [lr_results, dt_results, rf_results, gb_results]
if XGBOOST_AVAILABLE and xgb_results:
    all_results.append(xgb_results)

# Map each comparison-table column to the key it is read from in a result dict
column_sources = {
    'Model': 'Model',
    'Train_R¬≤': 'Train_R2',
    'Test_R¬≤': 'Test_R2',
    'Test_RMSE': 'Test_RMSE',
    'Test_MAE': 'Test_MAE',
    'Test_MAPE': 'Test_MAPE',
}
comparison_rows = [
    {column: result[source] for column, source in column_sources.items()}
    for result in all_results
]
comparison_df = pd.DataFrame(comparison_rows)

print("\n" + "="*80, "MODEL COMPARISON - SUMMARY", "="*80, sep="\n")
print(comparison_df.to_string(index=False))

# The winner is the row with the highest test-set R2
best_model_idx = comparison_df['Test_R¬≤'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
best_r2 = comparison_df.loc[best_model_idx, 'Test_R¬≤']
best_rmse = comparison_df.loc[best_model_idx, 'Test_RMSE']
best_mae = comparison_df.loc[best_model_idx, 'Test_MAE']

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   Test R¬≤ Score: {best_r2:.4f}")
print(f"   Test RMSE: ‚Çπ{best_rmse:.2f} Lakhs")
print(f"   Test MAE: ‚Çπ{best_mae:.2f} Lakhs")

## 📊 Visualization: Model Comparison Charts

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
model_names = comparison_df['Model']

# Panel 1: grouped bars of train vs test R-squared per model
r2_ax = axes[0, 0]
x_pos = np.arange(len(comparison_df))
bar_width = 0.4
r2_ax.bar(x_pos - 0.2, comparison_df['Train_R¬≤'], bar_width, label='Train R¬≤',
          color='skyblue', edgecolor='black')
r2_ax.bar(x_pos + 0.2, comparison_df['Test_R¬≤'], bar_width, label='Test R¬≤',
          color='orange', edgecolor='black')
r2_ax.set_xticks(x_pos)
r2_ax.set_xticklabels(model_names, rotation=15)
r2_ax.set_ylabel('R¬≤ Score', fontsize=12, fontweight='bold')
r2_ax.set_title('Model Comparison: R¬≤ Scores', fontsize=14, fontweight='bold')
r2_ax.legend()
r2_ax.grid(True, alpha=0.3, axis='y')

# Panels 2-4 share one layout: a bar per model with its value written above it.
# Tuple fields: axes, metric column, y-label, title, bar color,
#               vertical offset of the value label, value label format.
error_panels = [
    (axes[0, 1], 'Test_RMSE', 'RMSE (Lakhs)', 'Model Comparison: RMSE', 'coral', 0.5, '‚Çπ{:.2f}'),
    (axes[1, 0], 'Test_MAE', 'MAE (Lakhs)', 'Model Comparison: MAE', 'teal', 0.3, '‚Çπ{:.2f}'),
    (axes[1, 1], 'Test_MAPE', 'MAPE (%)', 'Model Comparison: MAPE', 'purple', 0.5, '{:.2f}%'),
]
for panel_ax, metric, y_label, panel_title, bar_color, label_offset, label_format in error_panels:
    metric_values = comparison_df[metric]
    panel_ax.bar(model_names, metric_values, color=bar_color, edgecolor='black', alpha=0.7)
    panel_ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.tick_params(axis='x', rotation=15)
    for i, v in enumerate(metric_values):
        panel_ax.text(i, v + label_offset, label_format.format(v), ha='center', fontweight='bold')
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('Model_Comparison_Charts.png', dpi=300, bbox_inches='tight')
plt.show()

## 💾 Save Best Model & Artifacts

In [None]:
# Determine best model object
model_objects = {
    'Linear Regression': lr_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model
}

if XGBOOST_AVAILABLE and xgb_results:
    model_objects['XGBoost'] = xgb_model

best_model_obj = model_objects[best_model_name]

# Only the linear model was fitted on the standardized matrices
# (X_train_scaled); every other model was fitted on the raw feature frame.
# Record this so whoever loads the pickles knows whether to apply the scaler.
requires_scaling = best_model_name == 'Linear Regression'

# Save best model
model_filename = f'best_model_{best_model_name.replace(" ", "_").lower()}.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(best_model_obj, f)
print(f"\nüíæ Best model saved: {model_filename}")

# Save scaler (needed at prediction time only when `requires_scaling` is True)
scaler_filename = 'feature_scaler.pkl'
with open(scaler_filename, 'wb') as f:
    pickle.dump(scaler, f)
print(f"üíæ Scaler saved: {scaler_filename}")

# Save feature columns (the column order the models were fitted with)
features_filename = 'feature_columns.pkl'
with open(features_filename, 'wb') as f:
    pickle.dump(feature_columns, f)
print(f"üíæ Feature columns saved: {features_filename}")

# Save comparison results
comparison_df.to_csv('model_comparison_results.csv', index=False)
print(f"üíæ Comparison results saved: model_comparison_results.csv")

# Create model info dictionary
model_info = {
    'best_model_name': best_model_name,
    'test_r2_score': best_r2,
    'test_rmse': comparison_df.loc[best_model_idx, 'Test_RMSE'],
    'test_mae': comparison_df.loc[best_model_idx, 'Test_MAE'],
    'test_mape': comparison_df.loc[best_model_idx, 'Test_MAPE'],
    'feature_columns': feature_columns,
    'target_column': target_column,
    'training_samples': len(X_train),
    'testing_samples': len(X_test),
    # New, additive key: True when inputs must go through feature_scaler.pkl first
    'requires_scaling': requires_scaling
}

with open('model_info.pkl', 'wb') as f:
    pickle.dump(model_info, f)
print(f"üíæ Model info saved: model_info.pkl")

print("\n‚úÖ ALL MODELS TRAINED AND COMPARED SUCCESSFULLY!")
print(f"\nüèÜ Final Recommendation: Use {best_model_name} for predictions")
print(f"   Accuracy (R¬≤): {best_r2:.2%}")
print(f"   Average Error: ¬±‚Çπ{comparison_df.loc[best_model_idx, 'Test_MAE']:.2f} Lakhs")
print(f"   Apply {scaler_filename} before predicting: {requires_scaling}")